restreamer: Prevent prom client blowing up after two different endpoints are hit

Prom client doesn't like you creating two stats with the same name,
even though they have different labels, which is a perfectly sensible thing to want.
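
For illustration, here's a minimal sketch of the failure (assuming prom is the stock
prometheus_client library; the label names are loosely based on the handler kwargs in the
diff below, and the exact error text is indicative only):

import prometheus_client as prom

# First endpoint hit: the decorator registers the metric with that handler's
# kwargs as labels.
prom.Histogram(
	'http_request_latency',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status', 'stream', 'variant'],
)

# Second endpoint hit: same name, different label set. The default registry
# refuses the duplicate name and the request handler blows up.
try:
	prom.Histogram(
		'http_request_latency',
		'Time taken to run the request handler and create a response',
		['endpoint', 'method', 'status', 'stream', 'variant', 'hour'],
	)
except ValueError as e:
	print(e)  # e.g. "Duplicated timeseries in CollectorRegistry: ..."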

I feel like I just need to rewrite the prom client at some point - it doesn't actually
do all that much except get in your way, apart from the actual text encoding, which I
can steal.

Anyway, in the meantime, we get around this by breaking each metric up into two names,
a "foo_all" and a "foo_ENDPOINT". The foo_all metric lacks the detailed labels,
but is still labelled by endpoint and can be used more easily.
The foo_ENDPOINT metrics carry the full set of labels, but require messier PromQL, as you need to
match on a name regex if you want to look at more than one specific endpoint.
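
As a rough sketch of the new scheme (again assuming stock prometheus_client; the endpoint
name, label values and PromQL here are illustrative, not taken from this commit):

import prometheus_client as prom

# Generic family: one shared name, fixed labels, easy to query across endpoints.
generic_latency = prom.Histogram(
	'http_request_latency_all',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status'],
)

# Per-endpoint family: its own name, so its extra labels (the handler kwargs)
# can't collide with any other endpoint's label set.
list_hours_latency = prom.Histogram(
	'http_request_latency_list_hours',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status', 'stream', 'variant'],
)

# Each request gets observed twice, once per family.
generic_latency.labels(endpoint='list_hours', method='GET', status='200').observe(0.05)
list_hours_latency.labels(
	endpoint='list_hours', method='GET', status='200',
	stream='example', variant='source',
).observe(0.05)

# Querying the generic family stays simple, e.g.
#   histogram_quantile(0.95, sum by (le) (rate(http_request_latency_all_bucket[5m])))
# whereas pulling the detailed labels across several endpoints means matching names
# by regex, something like {__name__=~"http_request_latency_.+_bucket"}.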
Branch: pull/27/head
Author: Mike Lang (committed by Christopher Usher)
Parent: 30c4bbec1d
Commit: c9d02b3318

@@ -91,6 +91,7 @@ def metrics():
 @app.route('/files/<stream>/<variant>')
+@stats
 @has_path_args
 def list_hours(stream, variant):
 	"""Returns a JSON list of hours for the given stream and variant for which

@@ -7,6 +7,35 @@ from flask import g as request_store
 from monotonic import monotonic
 
+
+# Generic metrics that all requests get logged to (see below for specific metrics per endpoint)
+
+LATENCY_HELP = "Time taken to run the request handler and create a response"
+# buckets: very long playlists / cutting can be quite slow,
+# so we have a wider range of latencies than default, up to 10min.
+LATENCY_BUCKETS = [.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600]
+generic_latency = prom.Histogram(
+	'http_request_latency_all', LATENCY_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=LATENCY_BUCKETS,
+)
+
+SIZE_HELP = 'Size in bytes of response body for non-chunked responses'
+# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
+SIZE_BUCKETS = [4**i for i in range(16)]
+generic_size = prom.Histogram(
+	'http_response_size_all', SIZE_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=SIZE_BUCKETS,
+)
+
+CONCURRENT_HELP = 'Number of requests currently ongoing'
+generic_concurrent = prom.Gauge(
+	'http_request_concurrency_all', CONCURRENT_HELP,
+	['endpoint', 'method'],
+)
+
+
 def stats(fn):
 	"""Decorator that wraps a handler func to collect metrics.
 	Adds handler func args as labels, along with 'endpoint' label using func's name,

@@ -15,6 +44,9 @@ def stats(fn):
 	# we pre-define our label names, but we don't know the names of the handler kwargs
 	# until the first time the function's called. So we delay defining the metrics until
 	# first call.
+	# In addition, it doesn't let us have different sets of labels with the same name.
+	# So we record everything twice: Once under a generic name with only endpoint, method
+	# and status, and once under a name specific to the endpoint with the full set of labels.
 	metrics = {}
 	endpoint = fn.__name__

@@ -25,30 +57,24 @@ def stats(fn):
 			labels_no_status = sorted(kwargs.keys()) + ['endpoint', 'method']
 			labels = labels_no_status + ['status']
 			metrics['latency'] = prom.Histogram(
-				'http_request_latency',
-				'Time taken to run the request handler and create a response',
-				labels,
-				# buckets: very long playlists / cutting can be quite slow,
-				# so we have a wider range of latencies than default, up to 10min.
-				buckets=[.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600],
+				'http_request_latency_{}'.format(endpoint), LATENCY_HELP,
+				labels, buckets=LATENCY_BUCKETS,
 			)
 			metrics['size'] = prom.Histogram(
-				'http_response_size',
-				'Size in bytes of response body for non-chunked responses',
-				labels,
-				# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
-				buckets=[4**i for i in range(16)],
+				'http_response_size_{}'.format(endpoint), SIZE_HELP,
+				labels, buckets=SIZE_BUCKETS,
 			)
 			metrics['concurrent'] = prom.Gauge(
-				'http_request_concurrency',
-				'Number of requests currently ongoing',
+				'http_request_concurrency_{}'.format(endpoint), CONCURRENT_HELP,
 				labels_no_status,
 			)
 			request_store.metrics = metrics
+		request_store.endpoint = endpoint
+		request_store.method = request.method
 		request_store.labels = {k: str(v) for k, v in kwargs.items()}
-		request_store.labels.update(endpoint=endpoint, method=request.method)
-		metrics['concurrent'].labels(**request_store.labels).inc()
+		generic_concurrent.labels(endpoint=endpoint, method=request.method).inc()
+		metrics['concurrent'].labels(endpoint=endpoint, method=request.method, **request_store.labels).inc()
 		request_store.start_time = monotonic()
 		return fn(**kwargs)

@@ -66,15 +92,20 @@ def after_request(response):
 	end_time = monotonic()
 	metrics = request_store.metrics
+	endpoint = request_store.endpoint
+	method = request_store.method
 	labels = request_store.labels
 	start_time = request_store.start_time
 
-	metrics['concurrent'].labels(**labels).dec()
+	generic_concurrent.labels(endpoint=endpoint, method=method).dec()
+	metrics['concurrent'].labels(endpoint=endpoint, method=method, **labels).dec()
 
-	labels['status'] = str(response.status_code)
-	metrics['latency'].labels(**labels).observe(end_time - start_time)
+	status = str(response.status_code)
+	generic_latency.labels(endpoint=endpoint, method=method, status=status).observe(end_time - start_time)
+	metrics['latency'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(end_time - start_time)
 	size = response.calculate_content_length()
 	if size is not None:
-		metrics['size'].labels(**labels).observe(size)
+		generic_size.labels(endpoint=endpoint, method=method, status=status).observe(size)
+		metrics['size'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(size)
 
 	return response
