restreamer: Prevent prom client blowing up after two different endpoints are hit

Prom client doesn't like you creating two stats with the same name,
even though they have different labels, which is a perfectly sensible thing to want.
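
For illustration, here's a minimal sketch of the failure (assuming prom is the stock
prometheus_client library; the label names are loosely based on the handler kwargs in the
diff below, and the exact error text is indicative only):

import prometheus_client as prom

# First endpoint hit: the decorator registers the metric with that handler's
# kwargs as labels.
prom.Histogram(
	'http_request_latency',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status', 'stream', 'variant'],
)

# Second endpoint hit: same name, different label set. The default registry
# refuses the duplicate name and the request handler blows up.
try:
	prom.Histogram(
		'http_request_latency',
		'Time taken to run the request handler and create a response',
		['endpoint', 'method', 'status', 'stream', 'variant', 'hour'],
	)
except ValueError as e:
	print(e)  # e.g. "Duplicated timeseries in CollectorRegistry: ..."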

I feel like I just need to rewrite the prom client at some point - it doesn't actually
do all that much except get in your way, apart from the actual text encoding, which I
can steal.

Anyway, in the meantime, we get around this by breaking each metric up into two names,
a "foo_all" and a "foo_ENDPOINT". The foo_all metric lacks the detailed labels,
but is still labelled by endpoint and can be used more easily.
The foo_ENDPOINT metrics carry the full set of labels, but require messier PromQL, as you need to
match on a name regex if you want to look at more than one specific endpoint.
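
As a rough sketch of the new scheme (again assuming stock prometheus_client; the endpoint
name, label values and PromQL here are illustrative, not taken from this commit):

import prometheus_client as prom

# Generic family: one shared name, fixed labels, easy to query across endpoints.
generic_latency = prom.Histogram(
	'http_request_latency_all',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status'],
)

# Per-endpoint family: its own name, so its extra labels (the handler kwargs)
# can't collide with any other endpoint's label set.
list_hours_latency = prom.Histogram(
	'http_request_latency_list_hours',
	'Time taken to run the request handler and create a response',
	['endpoint', 'method', 'status', 'stream', 'variant'],
)

# Each request gets observed twice, once per family.
generic_latency.labels(endpoint='list_hours', method='GET', status='200').observe(0.05)
list_hours_latency.labels(
	endpoint='list_hours', method='GET', status='200',
	stream='example', variant='source',
).observe(0.05)

# Querying the generic family stays simple, e.g.
#   histogram_quantile(0.95, sum by (le) (rate(http_request_latency_all_bucket[5m])))
# whereas pulling the detailed labels across several endpoints means matching names
# by regex, something like {__name__=~"http_request_latency_.+_bucket"}.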
Branch: pull/27/head
Author: Mike Lang (committed by Christopher Usher)
Parent: 30c4bbec1d
Commit: c9d02b3318

@@ -91,6 +91,7 @@ def metrics():
 @app.route('/files/<stream>/<variant>')
+@stats
 @has_path_args
 def list_hours(stream, variant):
 	"""Returns a JSON list of hours for the given stream and variant for which

@@ -7,6 +7,35 @@ from flask import g as request_store
 from monotonic import monotonic
 
+
+# Generic metrics that all requests get logged to (see below for specific metrics per endpoint)
+
+LATENCY_HELP = "Time taken to run the request handler and create a response"
+# buckets: very long playlists / cutting can be quite slow,
+# so we have a wider range of latencies than default, up to 10min.
+LATENCY_BUCKETS = [.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600]
+generic_latency = prom.Histogram(
+	'http_request_latency_all', LATENCY_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=LATENCY_BUCKETS,
+)
+
+SIZE_HELP = 'Size in bytes of response body for non-chunked responses'
+# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
+SIZE_BUCKETS = [4**i for i in range(16)]
+generic_size = prom.Histogram(
+	'http_response_size_all', SIZE_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=SIZE_BUCKETS,
+)
+
+CONCURRENT_HELP = 'Number of requests currently ongoing'
+generic_concurrent = prom.Gauge(
+	'http_request_concurrency_all', CONCURRENT_HELP,
+	['endpoint', 'method'],
+)
+
+
 def stats(fn):
 	"""Decorator that wraps a handler func to collect metrics.
 	Adds handler func args as labels, along with 'endpoint' label using func's name,

@@ -15,6 +44,9 @@ def stats(fn):
 	# we pre-define our label names, but we don't know the names of the handler kwargs
 	# until the first time the function's called. So we delay defining the metrics until
 	# first call.
+	# In addition, it doesn't let us have different sets of labels with the same name.
+	# So we record everything twice: Once under a generic name with only endpoint, method
+	# and status, and once under a name specific to the endpoint with the full set of labels.
 	metrics = {}
 	endpoint = fn.__name__

@@ -25,30 +57,24 @@ def stats(fn):
 			labels_no_status = sorted(kwargs.keys()) + ['endpoint', 'method']
 			labels = labels_no_status + ['status']
 			metrics['latency'] = prom.Histogram(
-				'http_request_latency',
-				'Time taken to run the request handler and create a response',
-				labels,
-				# buckets: very long playlists / cutting can be quite slow,
-				# so we have a wider range of latencies than default, up to 10min.
-				buckets=[.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600],
+				'http_request_latency_{}'.format(endpoint), LATENCY_HELP,
+				labels, buckets=LATENCY_BUCKETS,
 			)
 			metrics['size'] = prom.Histogram(
-				'http_response_size',
-				'Size in bytes of response body for non-chunked responses',
-				labels,
-				# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
-				buckets=[4**i for i in range(16)],
+				'http_response_size_{}'.format(endpoint), SIZE_HELP,
+				labels, buckets=SIZE_BUCKETS,
 			)
 			metrics['concurrent'] = prom.Gauge(
-				'http_request_concurrency',
-				'Number of requests currently ongoing',
+				'http_request_concurrency_{}'.format(endpoint), CONCURRENT_HELP,
 				labels_no_status,
 			)
 			request_store.metrics = metrics
+		request_store.endpoint = endpoint
+		request_store.method = request.method
 		request_store.labels = {k: str(v) for k, v in kwargs.items()}
-		request_store.labels.update(endpoint=endpoint, method=request.method)
-		metrics['concurrent'].labels(**request_store.labels).inc()
+		generic_concurrent.labels(endpoint=endpoint, method=request.method).inc()
+		metrics['concurrent'].labels(endpoint=endpoint, method=request.method, **request_store.labels).inc()
 		request_store.start_time = monotonic()
 		return fn(**kwargs)

@@ -66,15 +92,20 @@ def after_request(response):
 	end_time = monotonic()
 	metrics = request_store.metrics
+	endpoint = request_store.endpoint
+	method = request_store.method
 	labels = request_store.labels
 	start_time = request_store.start_time
 
-	metrics['concurrent'].labels(**labels).dec()
+	generic_concurrent.labels(endpoint=endpoint, method=method).dec()
+	metrics['concurrent'].labels(endpoint=endpoint, method=method, **labels).dec()
 
-	labels['status'] = str(response.status_code)
-	metrics['latency'].labels(**labels).observe(end_time - start_time)
+	status = str(response.status_code)
+	generic_latency.labels(endpoint=endpoint, method=method, status=status).observe(end_time - start_time)
+	metrics['latency'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(end_time - start_time)
 	size = response.calculate_content_length()
 	if size is not None:
-		metrics['size'].labels(**labels).observe(size)
+		generic_size.labels(endpoint=endpoint, method=method, status=status).observe(size)
+		metrics['size'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(size)
 
 	return response
