From 6bb46c582d5915f1339c98200511432d3cc51780 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Mon, 7 Jan 2019 01:28:10 -0800 Subject: [PATCH] common.stats: Add a stacksampler that records sampled stacks to prometheus This can then be used to generate flamegraphs --- common/common/__init__.py | 2 +- common/common/stats.py | 47 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/common/common/__init__.py b/common/common/__init__.py index 15978c2..0b345e3 100644 --- a/common/common/__init__.py +++ b/common/common/__init__.py @@ -8,7 +8,7 @@ import os import random from .segments import get_best_segments, parse_segment_path, SegmentInfo -from .stats import timed, PromLogCountsHandler +from .stats import timed, PromLogCountsHandler, install_stacksampler def dt_to_bustime(start, dt): diff --git a/common/common/stats.py b/common/common/stats.py index cc2394b..c01e209 100644 --- a/common/common/stats.py +++ b/common/common/stats.py @@ -1,10 +1,9 @@ -"""A place for common utilities between wubloader components""" - - +import atexit import functools import logging import os +import signal import sys import prometheus_client as prom @@ -172,3 +171,45 @@ class PromLogCountsHandler(logging.Handler): def install(cls): root_logger = logging.getLogger() root_logger.addHandler(cls()) + + +flamegraph = prom.Counter( + "flamegraph", + "Approx time consumed by each unique stack trace seen by sampling the stack", + ["stack"] +) + +def install_stacksampler(interval=0.005): + """Samples the stack every INTERVAL seconds of user time. + We could use user+sys time but that leads to interrupting syscalls, + which may affect performance, and we care mostly about user time anyway. + """ + # Note we only start each next timer once the previous timer signal has been processed. + # There are two reasons for this: + # 1. Avoid handling a signal while already handling a signal, however unlikely, + # as this could lead to a deadlock due to locking inside prometheus_client. + # 2. Avoid biasing the results by effectively not including the time taken to do the actual + # stack sampling. + + def sample(signum, frame): + stack = [] + while frame is not None: + stack.append(frame) + frame = frame.f_back + # format each frame as FUNCTION(MODULE) + stack = ";".join( + "{}({})".format(frame.f_code.co_name, frame.f_globals.get('__name__')) + for frame in stack[::-1] + ) + # increase counter by interval, so final units are in seconds + flamegraph.labels(stack).inc(interval) + # schedule the next signal + signal.setitimer(signal.ITIMER_VIRTUAL, interval) + + def cancel(): + signal.setitimer(signal.ITIMER_VIRTUAL, 0) + atexit.register(cancel) + + signal.signal(signal.SIGVTALRM, sample) + # deliver the first signal in INTERVAL seconds + signal.setitimer(signal.ITIMER_VIRTUAL, interval)