From 39e7a5c2e6626f7990edf947fa65371e661fd101 Mon Sep 17 00:00:00 2001
From: Mike Lang
Date: Mon, 24 Jun 2019 03:36:24 -0700
Subject: [PATCH 1/2] Add overview dashboard

---
 monitoring/dashboards/README           |   3 +
 monitoring/dashboards/overview.jsonnet | 148 +++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 monitoring/dashboards/README
 create mode 100644 monitoring/dashboards/overview.jsonnet

diff --git a/monitoring/dashboards/README b/monitoring/dashboards/README
new file mode 100644
index 0000000..8aabe2e
--- /dev/null
+++ b/monitoring/dashboards/README
@@ -0,0 +1,3 @@
+The files in this directory use a library for generating grafana dashboards
+that at time of writing I'm not at liberty to share. Unfortunately, I don't have a better option.
+So feel free to make changes, but I'll need to be the one to regenerate them.
diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet
new file mode 100644
index 0000000..e30b331
--- /dev/null
+++ b/monitoring/dashboards/overview.jsonnet
@@ -0,0 +1,148 @@
+local grafana = import "grafana.libsonnet";  // dashboard-generation library; per the README in this patch it is not distributed with the repo
+
+local services = [  // job names; each entry yields one "up" query and one status column in service_status_table
+  "restreamer",
+  "downloader",
+  "backfiller",
+  "cutter",
+  "thrimshim",
+  "sheetsync",
+];
+
+local service_status_table = {  // hand-written table panel, used as the "custom" value of the "Service Status by Node" panel
+  local refId(n) = std.char(std.codepoint('A') + n),  // index 0, 1, 2, ... -> "A", "B", "C", ...
+  type: "table",
+  targets: [
+    {
+      expr: 'sum(up{job="%s"}) by (instance)' % services[i],  // %s is filled in with the job name
+      intervalFactor: 1,
+      format: "table",
+      refId: refId(i),
+      legendFormat: "",
+      instant: true,
+    }
+    for i in std.range(0, std.length(services) - 1)  // one target per entry in services
+  ],
+  styles: [
+    // hidden cols
+    {
+      unit: "short",
+      type: "hidden",
+      alias: "",
+      decimals: 2,
+      colors: [
+        "rgba(245, 54, 54, 0.9)",
+        "rgba(237, 129, 40, 0.89)",
+        "rgba(50, 172, 45, 0.97)",
+      ],
+      colorMode: null,
+      pattern: name,
+      dateFormat: "YYYY-MM-DD HH:mm:ss",
+      thresholds: [],
+      mappingType: 1,
+    }
+    for name in ["__name__", "job", "Time"]  // one "hidden" style per listed column name
+  ] + [
+    // service cols
+    {
+      unit: "short",
+      type: "string",
+      alias: services[i],
+      decimals: 2,
+      colors: [
+        "rgba(245, 54, 54, 0.9)",
+        "rgba(237, 129, 40, 0.89)",
+        "rgba(50, 172, 45, 0.97)",
+      ],
+      colorMode: "cell",
+      pattern: "Value #%s" % refId(i),  // "Value #A" for the first service, "Value #B" for the second, ...
+      dateFormat: "YYYY-MM-DD HH:mm:ss",
+      thresholds: [
+        "0.5",
+        "0.5",
+      ],
+      mappingType: 1,
+      valueMaps: [  // maps the value "0" to the text "DOWN" and "1" to "UP"
+        {
+          value: "0",
+          text: "DOWN",
+        },
+        {
+          value: "1",
+          text: "UP",
+        },
+      ],
+    } for i in std.range(0, std.length(services) - 1)  // one style per entry in services, same index as its target
+  ],
+  transform: "table",
+  pageSize: null,
+  showHeader: true,
+  columns: [],
+  scroll: true,
+  fontSize: "100%",
+  sort: {
+    col: 0,
+    desc: true,
+  },
+  links: [],
+};
+
+grafana.dashboard({  // dashboard description handed to grafana.dashboard()
+  name: "Overview",
+  uid: "rjd405mn",
+
+  rows: [
+
+    {
+      panels: [
+        // First row - immediate status heads-up
+        [
+          {
+            name: "Service Status by Node",
+            span: 2 * grafana.span.third,
+            custom: service_status_table,
+          },
+          {
+            name: "Error log rate",
+            axis: {min: 0, label: "logs / sec"},
+            display: "bars",
+            expressions: {  // irate of log_count_total for every level except INFO; "> 0" keeps only series whose value is above 0
+              "{{job}} {{level}}({{module}}:{{function}})": |||
+                sum(irate(log_count_total{level!="INFO"}[2m])) by (job, level, module, function) > 0
+              |||,
+            },
+          },
+        ],
+        // Second row - core "business" metrics
+        [
+          {
+            name: "Segments downloaded",
+            axis: {min: 0, label: "segments / sec"},
+            expressions: {  // two series families: live-captured and backfilled segments, each grouped by stream and variant
+              "{{stream}}({{variant}}) live capture":
+                'sum(rate(segments_downloaded_total[2m])) by (stream, variant)',
+              "{{stream}}({{variant}}) backfilled":
+                'sum(rate(segments_backfilled_total[2m])) by (stream, variant)',
+            },
+          },
+          {
+            name: "Successful requests by endpoint",
+            axis: {min: 0, label: "requests / sec"},
+            expressions: {  // only requests with status="200" are counted
+              "{{method}} {{endpoint}}":
+                'sum(rate(http_request_latency_all_count{status="200"}[2m])) by (endpoint, method)',
+            },
+          },
+          {
+            name: "Database events by state",
+            axis: {min: 0, label: "events"},
+            tooltip: "Not implemented", // TODO
+            expressions: {"Not implemented": "0"},  // placeholder: plots the constant expression "0"
+          },
+        ],
+      ],
+    },
+
+  ],
+
+})
From ca925ae2e60de8551636a69ca26360a822eff6c7 Mon Sep 17 00:00:00 2001
From: Mike Lang
Date: Fri, 28 Jun 2019 18:28:10 -0700
Subject: [PATCH 2/2] dashboard: Add some extra detail sections for backfiller and downloader

---
 monitoring/dashboards/overview.jsonnet | 37 ++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet
index e30b331..139c139 100644
--- a/monitoring/dashboards/overview.jsonnet
+++ b/monitoring/dashboards/overview.jsonnet
@@ -143,6 +143,43 @@ grafana.dashboard({
       ],
     },
 
+    {
+      name: "Downloader",  // named row with per-node downloader detail
+      panels: [  // NOTE(review): flat list of panels here, whereas the first row nests panels in sub-lists - confirm the library accepts both
+        {
+          name: "Segments downloaded by node",
+          axis: {min: 0, label: "segments / sec"},
+          expressions: {  // same metric as the "Segments downloaded" panel, additionally grouped by instance
+            "{{instance}} {{stream}}({{variant}})":
+              'sum(rate(segments_downloaded_total[2m])) by (instance, stream, variant)',
+          },
+        },
+        {
+          name: "Downloader stream delay by node",
+          tooltip: "Time between the latest downloaded segment's timestamp and current time",
+          axis: {min: 0, format: grafana.formats.time},
+          expressions: {  // current time minus the largest latest_segment value per (instance, stream, variant)
+            "{{instance}} {{stream}}({{variant}})":
+              'time() - max(latest_segment) by (instance, stream, variant)',
+          },
+        },
+      ],
+    },
+
+    {
+      name: "Backfiller",  // named row with backfiller detail
+      panels: [
+        {
+          name: "Backfill by node pair",
+          axis: {min: 0, label: "segments / sec"},
+          expressions: {  // backfill rate grouped by the (remote, instance) label pair
+            "{{remote}} -> {{instance}}":
+              'sum(rate(segments_backfilled_total[2m])) by (remote, instance)',
+          },
+        },
+      ],
+    },
+
   ],
 
 })