From e5a7c8adfa721e711edb159d917081264abcf9db Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Wed, 6 Nov 2019 05:12:42 -0800 Subject: [PATCH] monitoring: Add "role" concept This lets us know if a service is MEANT to be running or not. --- monitoring/dashboards/overview.json | 57 +++++++++++++++++++++----- monitoring/dashboards/overview.jsonnet | 28 +++++++++---- monitoring/prometheus.jsonnet | 11 +++-- 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/monitoring/dashboards/overview.json b/monitoring/dashboards/overview.json index 84b3a5a..dcb6229 100644 --- a/monitoring/dashboards/overview.json +++ b/monitoring/dashboards/overview.json @@ -12,8 +12,8 @@ "panels": [ { "sort": { - "col": 0, - "desc": true + "col": 1, + "desc": false }, "styles": [ { @@ -169,7 +169,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "cutter", + "alias": "segment_coverage", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -227,7 +227,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "sheetsync", + "alias": "cutter", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -237,6 +237,35 @@ "pattern": "Value #F", "type": "string", "unit": "short" + }, + { + "mappingType": 1, + "valueMaps": [ + { + "text": "DOWN", + "value": "0" + }, + { + "text": "UP", + "value": "1" + } + ], + "colorMode": "cell", + "thresholds": [ + "0.5", + "0.5" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "alias": "sheetsync", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "Value #G", + "type": "string", + "unit": "short" } ], "span": 8, @@ -249,7 +278,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"restreamer\"}) by (instance)", + "expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "A" @@ -257,7 +286,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"downloader\"}) by (instance)", + "expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "B" @@ -265,7 +294,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"backfiller\"}) by (instance)", + "expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "C" @@ -273,7 +302,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"cutter\"}) by (instance)", + "expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "D" @@ -281,7 +310,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"thrimshim\"}) by (instance)", + "expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "E" @@ -289,10 +318,18 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"sheetsync\"}) by (instance)", + "expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "F" + }, + { + "instant": true, + "format": "table", + "expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)", + "legendFormat": "", + "intervalFactor": 1, + "refId": "G" } ], "fontSize": "100%", diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet index 1bd7fce..b38eede 100644 --- a/monitoring/dashboards/overview.jsonnet +++ b/monitoring/dashboards/overview.jsonnet @@ -1,13 +1,30 @@ local grafana = import "grafana.libsonnet"; +// Map from service to regex of matching roles. +// Role explanations: +// replica: Just downloads and replicates segments +// local_edit: Also runs a local thrimbletrimmer for doing local cuts +// edit: Also runs cutter for doing uploads +// leader: Also runs things that only run in one place, eg. sheetsync +local roles_for_service = { + "restreamer": ".*", + "downloader": ".*", + "backfiller": ".*", + "segment_coverage": ".*", + "thrimshim": "leader|edit|local_edit", + "cutter": "leader|edit", + "sheetsync": "leader", +}; + +// List of services, to impart ordering local services = [ "restreamer", "downloader", "backfiller", - "cutter", + "segment_coverage", "thrimshim", + "cutter", "sheetsync", - "segment_coverage", ]; local service_status_table = { @@ -15,7 +32,7 @@ local service_status_table = { type: "table", targets: [ { - expr: 'sum(up{job="%s"}) by (instance)' % services[i], + expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]], intervalFactor: 1, format: "table", refId: refId(i), @@ -81,10 +98,7 @@ local service_status_table = { columns: [], scroll: true, fontSize: "100%", - sort: { - col: 0, - desc: true, - }, + sort: {col: 1, desc: false}, // sort by instance links: [], }; diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index bc2f274..b22ee5d 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -1,5 +1,7 @@ local hosts = { - // name: "host:port" + // name: ["host:port", role] + // See overview.jsonnet for role explanations. + mynode: ["localhost:8080", "replica"] }; local services = [ "restreamer", @@ -29,8 +31,11 @@ local services = [ metrics_path: "/metrics/%s" % service, static_configs: [ { - targets: [hosts[host]], - labels: {instance: host}, + targets: [hosts[host][0]], + labels: { + instance: host, + role: hosts[host][1], + }, } for host in std.objectFields(hosts) ], }