From b84d4de085015913f94fdcabd8e1fd70e5b90510 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Wed, 6 Nov 2019 04:55:08 -0800 Subject: [PATCH 1/4] Add segment_coverage service to be monitored --- monitoring/dashboards/overview.jsonnet | 1 + monitoring/prometheus.jsonnet | 1 + 2 files changed, 2 insertions(+) diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet index 09d1c90..1bd7fce 100644 --- a/monitoring/dashboards/overview.jsonnet +++ b/monitoring/dashboards/overview.jsonnet @@ -7,6 +7,7 @@ local services = [ "cutter", "thrimshim", "sheetsync", + "segment_coverage", ]; local service_status_table = { diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index df792e1..faf27c9 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -10,6 +10,7 @@ local services = [ "cutter", "thrimshim", "sheetsync", + "segment_coverage", ]; { From 51adeeab192d3f288ffc643650741453e97cc6c8 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Wed, 6 Nov 2019 04:55:35 -0800 Subject: [PATCH 2/4] monitoring: Fix problems with the prometheus container --- monitoring/docker-compose.yml | 4 ++++ monitoring/prometheus.jsonnet | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 462f3fb..b9b659d 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -3,7 +3,11 @@ services: prometheus: image: "wubloader_prometheus:latest" command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" - "--storage.tsdb.retention=30d" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" restart: "on-failure" volumes: - "/storage/wubloader-metrics/prometheus:/prometheus" diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index faf27c9..3c58d9d 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -1,7 +1,4 @@ local hosts = [ - "toodles.videostrike.team:1337", - "http://136.24.9.73:20088", - "wubloader.codegunner.com", ]; local services = [ "restreamer", From 21a46a66bb654b4e3aca21115eae868f9e0a7838 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Wed, 6 Nov 2019 04:56:41 -0800 Subject: [PATCH 3/4] monitoring: Set instance to friendly name for each node we're monitoring So that you get eg. "charm" instead of "IP:PORT" --- monitoring/prometheus.jsonnet | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index 3c58d9d..bc2f274 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -1,5 +1,6 @@ -local hosts = [ -]; +local hosts = { + // name: "host:port" +}; local services = [ "restreamer", "downloader", @@ -16,14 +17,22 @@ local services = [ scrape_interval: "15s", }, scrape_configs: [ - {job_name: "prometheus", static_configs: [{targets: ["localhost:9090"]}]}, + { + job_name: "prometheus", + static_configs: [ + {targets: ["localhost:9090"], labels: {instance: "prometheus"}} + ], + }, ] + [ { job_name: service, metrics_path: "/metrics/%s" % service, - static_configs: [{ - targets: hosts, - }], + static_configs: [ + { + targets: [hosts[host]], + labels: {instance: host}, + } for host in std.objectFields(hosts) + ], } for service in services ], From e5a7c8adfa721e711edb159d917081264abcf9db Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Wed, 6 Nov 2019 05:12:42 -0800 Subject: [PATCH 4/4] monitoring: Add "role" concept This lets us know if a service is MEANT to be running or not. --- monitoring/dashboards/overview.json | 57 +++++++++++++++++++++----- monitoring/dashboards/overview.jsonnet | 28 +++++++++---- monitoring/prometheus.jsonnet | 11 +++-- 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/monitoring/dashboards/overview.json b/monitoring/dashboards/overview.json index 84b3a5a..dcb6229 100644 --- a/monitoring/dashboards/overview.json +++ b/monitoring/dashboards/overview.json @@ -12,8 +12,8 @@ "panels": [ { "sort": { - "col": 0, - "desc": true + "col": 1, + "desc": false }, "styles": [ { @@ -169,7 +169,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "cutter", + "alias": "segment_coverage", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -227,7 +227,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "sheetsync", + "alias": "cutter", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -237,6 +237,35 @@ "pattern": "Value #F", "type": "string", "unit": "short" + }, + { + "mappingType": 1, + "valueMaps": [ + { + "text": "DOWN", + "value": "0" + }, + { + "text": "UP", + "value": "1" + } + ], + "colorMode": "cell", + "thresholds": [ + "0.5", + "0.5" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "alias": "sheetsync", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "Value #G", + "type": "string", + "unit": "short" } ], "span": 8, @@ -249,7 +278,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"restreamer\"}) by (instance)", + "expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "A" @@ -257,7 +286,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"downloader\"}) by (instance)", + "expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "B" @@ -265,7 +294,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"backfiller\"}) by (instance)", + "expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "C" @@ -273,7 +302,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"cutter\"}) by (instance)", + "expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "D" @@ -281,7 +310,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"thrimshim\"}) by (instance)", + "expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "E" @@ -289,10 +318,18 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"sheetsync\"}) by (instance)", + "expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "F" + }, + { + "instant": true, + "format": "table", + "expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)", + "legendFormat": "", + "intervalFactor": 1, + "refId": "G" } ], "fontSize": "100%", diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet index 1bd7fce..b38eede 100644 --- a/monitoring/dashboards/overview.jsonnet +++ b/monitoring/dashboards/overview.jsonnet @@ -1,13 +1,30 @@ local grafana = import "grafana.libsonnet"; +// Map from service to regex of matching roles. +// Role explanations: +// replica: Just downloads and replicates segments +// local_edit: Also runs a local thrimbletrimmer for doing local cuts +// edit: Also runs cutter for doing uploads +// leader: Also runs things that only run in one place, eg. sheetsync +local roles_for_service = { + "restreamer": ".*", + "downloader": ".*", + "backfiller": ".*", + "segment_coverage": ".*", + "thrimshim": "leader|edit|local_edit", + "cutter": "leader|edit", + "sheetsync": "leader", +}; + +// List of services, to impart ordering local services = [ "restreamer", "downloader", "backfiller", - "cutter", + "segment_coverage", "thrimshim", + "cutter", "sheetsync", - "segment_coverage", ]; local service_status_table = { @@ -15,7 +32,7 @@ local service_status_table = { type: "table", targets: [ { - expr: 'sum(up{job="%s"}) by (instance)' % services[i], + expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]], intervalFactor: 1, format: "table", refId: refId(i), @@ -81,10 +98,7 @@ local service_status_table = { columns: [], scroll: true, fontSize: "100%", - sort: { - col: 0, - desc: true, - }, + sort: {col: 1, desc: false}, // sort by instance links: [], }; diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index bc2f274..b22ee5d 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -1,5 +1,7 @@ local hosts = { - // name: "host:port" + // name: ["host:port", role] + // See overview.jsonnet for role explanations. + mynode: ["localhost:8080", "replica"] }; local services = [ "restreamer", @@ -29,8 +31,11 @@ local services = [ metrics_path: "/metrics/%s" % service, static_configs: [ { - targets: [hosts[host]], - labels: {instance: host}, + targets: [hosts[host][0]], + labels: { + instance: host, + role: hosts[host][1], + }, } for host in std.objectFields(hosts) ], }