diff --git a/monitoring/dashboards/overview.json b/monitoring/dashboards/overview.json index 84b3a5a..dcb6229 100644 --- a/monitoring/dashboards/overview.json +++ b/monitoring/dashboards/overview.json @@ -12,8 +12,8 @@ "panels": [ { "sort": { - "col": 0, - "desc": true + "col": 1, + "desc": false }, "styles": [ { @@ -169,7 +169,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "cutter", + "alias": "segment_coverage", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -227,7 +227,7 @@ "0.5" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "alias": "sheetsync", + "alias": "cutter", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -237,6 +237,35 @@ "pattern": "Value #F", "type": "string", "unit": "short" + }, + { + "mappingType": 1, + "valueMaps": [ + { + "text": "DOWN", + "value": "0" + }, + { + "text": "UP", + "value": "1" + } + ], + "colorMode": "cell", + "thresholds": [ + "0.5", + "0.5" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "alias": "sheetsync", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "Value #G", + "type": "string", + "unit": "short" } ], "span": 8, @@ -249,7 +278,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"restreamer\"}) by (instance)", + "expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "A" @@ -257,7 +286,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"downloader\"}) by (instance)", + "expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "B" @@ -265,7 +294,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"backfiller\"}) by (instance)", + "expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "C" @@ -273,7 +302,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"cutter\"}) by (instance)", + "expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "D" @@ -281,7 +310,7 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"thrimshim\"}) by (instance)", + "expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "E" @@ -289,10 +318,18 @@ { "instant": true, "format": "table", - "expr": "sum(up{job=\"sheetsync\"}) by (instance)", + "expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)", "legendFormat": "", "intervalFactor": 1, "refId": "F" + }, + { + "instant": true, + "format": "table", + "expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)", + "legendFormat": "", + "intervalFactor": 1, + "refId": "G" } ], "fontSize": "100%", diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet index 09d1c90..b38eede 100644 --- a/monitoring/dashboards/overview.jsonnet +++ b/monitoring/dashboards/overview.jsonnet @@ -1,11 +1,29 @@ local grafana = import "grafana.libsonnet"; +// Map from service to regex of matching roles. +// Role explanations: +// replica: Just downloads and replicates segments +// local_edit: Also runs a local thrimbletrimmer for doing local cuts +// edit: Also runs cutter for doing uploads +// leader: Also runs things that only run in one place, eg. sheetsync +local roles_for_service = { + "restreamer": ".*", + "downloader": ".*", + "backfiller": ".*", + "segment_coverage": ".*", + "thrimshim": "leader|edit|local_edit", + "cutter": "leader|edit", + "sheetsync": "leader", +}; + +// List of services, to impart ordering local services = [ "restreamer", "downloader", "backfiller", - "cutter", + "segment_coverage", "thrimshim", + "cutter", "sheetsync", ]; @@ -14,7 +32,7 @@ local service_status_table = { type: "table", targets: [ { - expr: 'sum(up{job="%s"}) by (instance)' % services[i], + expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]], intervalFactor: 1, format: "table", refId: refId(i), @@ -80,10 +98,7 @@ local service_status_table = { columns: [], scroll: true, fontSize: "100%", - sort: { - col: 0, - desc: true, - }, + sort: {col: 1, desc: false}, // sort by instance links: [], }; diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 462f3fb..b9b659d 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -3,7 +3,11 @@ services: prometheus: image: "wubloader_prometheus:latest" command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" - "--storage.tsdb.retention=30d" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" restart: "on-failure" volumes: - "/storage/wubloader-metrics/prometheus:/prometheus" diff --git a/monitoring/prometheus.jsonnet b/monitoring/prometheus.jsonnet index df792e1..b22ee5d 100644 --- a/monitoring/prometheus.jsonnet +++ b/monitoring/prometheus.jsonnet @@ -1,8 +1,8 @@ -local hosts = [ - "toodles.videostrike.team:1337", - "http://136.24.9.73:20088", - "wubloader.codegunner.com", -]; +local hosts = { + // name: ["host:port", role] + // See overview.jsonnet for role explanations. + mynode: ["localhost:8080", "replica"] +}; local services = [ "restreamer", "downloader", @@ -10,6 +10,7 @@ local services = [ "cutter", "thrimshim", "sheetsync", + "segment_coverage", ]; { @@ -18,14 +19,25 @@ local services = [ scrape_interval: "15s", }, scrape_configs: [ - {job_name: "prometheus", static_configs: [{targets: ["localhost:9090"]}]}, + { + job_name: "prometheus", + static_configs: [ + {targets: ["localhost:9090"], labels: {instance: "prometheus"}} + ], + }, ] + [ { job_name: service, metrics_path: "/metrics/%s" % service, - static_configs: [{ - targets: hosts, - }], + static_configs: [ + { + targets: [hosts[host][0]], + labels: { + instance: host, + role: hosts[host][1], + }, + } for host in std.objectFields(hosts) + ], } for service in services ],