// wubloader/monitoring/dashboards/overview.jsonnet

local grafana = import "grafana.libsonnet";

// For each service, a regex selecting the node roles that run it.
// What each role means:
//  replica: only downloads and replicates segments
//  local_edit: additionally runs a local thrimbletrimmer for doing local cuts
//  edit: additionally runs cutter for doing uploads
//  leader: additionally runs things that only run in one place, eg. sheetsync
local roles_for_service = {
  // Regex that accepts any role (object-local, so it is not a field of the map).
  local every_role = ".*",

  restreamer: every_role,
  downloader: every_role,
  backfiller: every_role,
  segment_coverage: every_role,
  thrimshim: "leader|edit|local_edit",
  cutter: "leader|edit",
  sheetsync: "leader",
};

// Services in the order they should be presented (object fields alone
// carry no ordering, so the order is spelled out here).
local services =
  // mapped to the match-everything role regex
  ["restreamer", "downloader", "backfiller", "segment_coverage"]
  // mapped to a restricted set of roles
  + ["thrimshim", "cutter", "sheetsync"];

// Definition of a "table" panel with one instant query per service.
// Each query sums the "up" metric by instance, restricted to the roles that
// run that service; the resulting value column is displayed as UP / DOWN.
local service_status_table = {
  // Query ids are consecutive capital letters: 0 -> "A", 1 -> "B", ...
  local letter(idx) = std.char(std.codepoint('A') + idx),

  // Fields that every column style has in common.
  local base_style = {
    unit: "short",
    decimals: 2,
    colors: [
      "rgba(245, 54, 54, 0.9)",
      "rgba(237, 129, 40, 0.89)",
      "rgba(50, 172, 45, 0.97)",
    ],
    dateFormat: "YYYY-MM-DD HH:mm:ss",
    mappingType: 1,
  },

  // Style that hides the column matching the given pattern.
  local hidden_column(column) = base_style + {
    type: "hidden",
    alias: "",
    colorMode: null,
    pattern: column,
    thresholds: [],
  },

  // Style for the value column of the idx'th query: titled with the service
  // name, coloured per cell, with 0 / 1 shown as DOWN / UP.
  local service_column(idx, service) = base_style + {
    type: "string",
    alias: service,
    colorMode: "cell",
    pattern: "Value #%s" % letter(idx),
    thresholds: ["0.5", "0.5"],
    valueMaps: [
      {value: "0", text: "DOWN"},
      {value: "1", text: "UP"},
    ],
  },

  // One instant, table-formatted query per service.
  local service_query(idx, service) = {
    refId: letter(idx),
    expr: 'sum(up{service="%s", role=~"%s"}) by (instance)' % [service, roles_for_service[service]],
    format: "table",
    instant: true,
    intervalFactor: 1,
    legendFormat: "",
  },

  type: "table",
  transform: "table",
  targets: std.mapWithIndex(service_query, services),
  styles:
    std.map(hidden_column, ["__name__", "service", "Time"])
    + std.mapWithIndex(service_column, services),
  columns: [],
  links: [],
  showHeader: true,
  scroll: true,
  pageSize: null,
  fontSize: "100%",
  // sort by instance
  sort: {col: 1, desc: false},
};

// Format arguments for the query templates in this file: a template's
// %(labels)s placeholder is replaced by a label matcher on the "$instance"
// dashboard variable.
local labels = {
  local instance_matcher = 'instance=~"$instance"',
  labels: instance_matcher,
};

// Fills the %(labels)s placeholder of a query template from `labels`.
local scoped(template) = template % labels;

// First row - immediate status heads-up
local status_panels = [
  {
    name: "Service Status by Node",
    span: 2 * grafana.span.third,
    custom: service_status_table,
  },
  {
    // Rate of log messages at any level other than INFO.
    name: "Error log rate",
    display: "bars",
    axis: {
      min: 0,
      label: "logs / sec",
    },
    expressions: {
      "{{instance}} {{service}} {{level}}({{module}}:{{function}})": scoped(|||
        sum(irate(log_count_total{level!="INFO", %(labels)s}[2m])) by (instance, service, level, module, function) > 0
      |||),
    },
  },
];

// Second row - core "business" metrics
local business_panels = [
  {
    name: "Segments downloaded",
    axis: {
      min: 0,
      label: "segments / sec",
    },
    expressions: {
      "{{channel}}({{quality}}) live capture": scoped(
        'sum(rate(segments_downloaded_total{%(labels)s}[2m])) by (channel, quality)'
      ),
      "{{channel}}({{quality}}) backfilled": scoped(
        'sum(rate(segments_backfilled_total{%(labels)s}[2m])) by (channel, quality)'
      ),
    },
  },
  {
    name: "Successful requests by endpoint",
    axis: {
      min: 0,
      label: "requests / sec",
    },
    expressions: {
      "{{method}} {{endpoint}}": scoped(
        'sum(rate(http_request_latency_all_count{status="200", %(labels)s}[2m])) by (endpoint, method)'
      ),
    },
  },
  {
    name: "Database events by state",
    tooltip: "Does not include UNEDITED or DONE events",
    stack: true,
    axis: {
      min: 0,
      label: "events",
    },
    expressions: {
      "{{state}}": scoped(|||
        sum(event_counts{state!="UNEDITED", state!="DONE", %(labels)s}) by (state)
      |||),
    },
  },
];

// Third row - process-level health
local process_panels = [
  {
    name: "CPU usage",
    axis: {
      min: 0,
      label: "cores",
      format: grafana.formats.percent,
    },
    expressions: {
      "{{instance}} {{service}}": scoped(|||
        sum by (instance, service) (
          rate(process_cpu_seconds_total{%(labels)s}[2m])
        )
      |||),
    },
  },
  {
    name: "Memory usage (RSS)",
    axis: {
      min: 0,
      format: grafana.formats.bytes,
    },
    expressions: {
      "{{instance}} {{service}}": scoped('process_resident_memory_bytes{%(labels)s}'),
    },
  },
  {
    // Counts changes of the process start time over the last minute.
    name: "Process restarts",
    tooltip: "Multiple restarts within 15sec will be missed, and only counted as one.",
    axis: {
      min: 0,
      label: "restarts within last minute",
    },
    expressions: {
      "{{instance}} {{service}}": scoped('changes(process_start_time_seconds{%(labels)s}[1m])'),
    },
  },
];

// Panels of the "Downloader" row.
local downloader_panels = [
  {
    name: "Segments downloaded by node",
    axis: {
      min: 0,
      label: "segments / sec",
    },
    expressions: {
      "{{instance}} {{channel}}({{quality}})": scoped(
        'sum(rate(segments_downloaded_total{%(labels)s}[2m])) by (instance, channel, quality)'
      ),
    },
  },
  {
    name: "Downloader stream delay by node",
    tooltip: "Time between the latest downloaded segment's timestamp and current time",
    axis: {
      min: 0,
      format: grafana.formats.time,
    },
    expressions: {
      // Series for which no segments are currently being fetched are dropped:
      // all they would show is that a long time has passed since the last segment.
      "{{instance}} {{channel}}({{quality}})": scoped(|||
        time() - max(latest_segment{%(labels)s}) by (instance, channel, quality)
        and sum(irate(segments_downloaded_total{%(labels)s}[2m])) by (instance, channel, quality) > 0
      |||),
    },
  },
];

// Panels of the "Backfiller" row.
local backfiller_panels = [
  {
    name: "Backfill by node pair",
    axis: {
      min: 0,
      label: "segments / sec",
    },
    expressions: {
      "{{remote}} -> {{instance}}": scoped(
        'sum(rate(segments_backfilled_total{%(labels)s}[2m])) by (remote, instance)'
      ),
    },
  },
];

// The "Overview" dashboard: an unnamed top row holding three lists of panels,
// followed by one named row each for the downloader and the backfiller.
grafana.dashboard({
  name: "Overview",
  uid: "rjd405mn",
  refresh: "30s",

  // Template variable referenced as $instance by the panel queries.
  templates: [
    {name: "instance", query: 'label_values(up, instance)'},
  ],

  rows: [
    {panels: [status_panels, business_panels, process_panels]},
    {name: "Downloader", panels: downloader_panels},
    {name: "Backfiller", panels: backfiller_panels},
  ],

})