monitoring: Add "role" concept

This lets us know if a service is MEANT to be running or not.
pull/149/head
Mike Lang 5 years ago
parent 21a46a66bb
commit e5a7c8adfa

@ -12,8 +12,8 @@
"panels": [ "panels": [
{ {
"sort": { "sort": {
"col": 0, "col": 1,
"desc": true "desc": false
}, },
"styles": [ "styles": [
{ {
@ -169,7 +169,7 @@
"0.5" "0.5"
], ],
"dateFormat": "YYYY-MM-DD HH:mm:ss", "dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "cutter", "alias": "segment_coverage",
"colors": [ "colors": [
"rgba(245, 54, 54, 0.9)", "rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)", "rgba(237, 129, 40, 0.89)",
@ -227,7 +227,7 @@
"0.5" "0.5"
], ],
"dateFormat": "YYYY-MM-DD HH:mm:ss", "dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync", "alias": "cutter",
"colors": [ "colors": [
"rgba(245, 54, 54, 0.9)", "rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)", "rgba(237, 129, 40, 0.89)",
@ -237,6 +237,35 @@
"pattern": "Value #F", "pattern": "Value #F",
"type": "string", "type": "string",
"unit": "short" "unit": "short"
},
{
"mappingType": 1,
"valueMaps": [
{
"text": "DOWN",
"value": "0"
},
{
"text": "UP",
"value": "1"
}
],
"colorMode": "cell",
"thresholds": [
"0.5",
"0.5"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync",
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"decimals": 2,
"pattern": "Value #G",
"type": "string",
"unit": "short"
} }
], ],
"span": 8, "span": 8,
@ -249,7 +278,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"restreamer\"}) by (instance)", "expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "A" "refId": "A"
@ -257,7 +286,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"downloader\"}) by (instance)", "expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "B" "refId": "B"
@ -265,7 +294,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"backfiller\"}) by (instance)", "expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "C" "refId": "C"
@ -273,7 +302,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"cutter\"}) by (instance)", "expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "D" "refId": "D"
@ -281,7 +310,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"thrimshim\"}) by (instance)", "expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "E" "refId": "E"
@ -289,10 +318,18 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"sheetsync\"}) by (instance)", "expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "F" "refId": "F"
},
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "G"
} }
], ],
"fontSize": "100%", "fontSize": "100%",

@ -1,13 +1,30 @@
local grafana = import "grafana.libsonnet"; local grafana = import "grafana.libsonnet";
// Map from service to regex of matching roles, i.e. the roles on which that
// service is meant to be running.
// Each value is substituted into the role=~"..." label matcher of the
// service's `up` query in service_status_table, so ".*" accepts any role.
// Every name listed in `services` needs a key here, because the query is
// built by looking up roles_for_service[services[i]].
// Role explanations:
// replica: Just downloads and replicates segments
// local_edit: Also runs a local thrimbletrimmer for doing local cuts
// edit: Also runs cutter for doing uploads
// leader: Also runs things that only run in one place, eg. sheetsync
local roles_for_service = {
"restreamer": ".*",
"downloader": ".*",
"backfiller": ".*",
"segment_coverage": ".*",
"thrimshim": "leader|edit|local_edit",
"cutter": "leader|edit",
"sheetsync": "leader",
};
// List of services, to impart ordering
local services = [ local services = [
"restreamer", "restreamer",
"downloader", "downloader",
"backfiller", "backfiller",
"cutter", "segment_coverage",
"thrimshim", "thrimshim",
"cutter",
"sheetsync", "sheetsync",
"segment_coverage",
]; ];
local service_status_table = { local service_status_table = {
@ -15,7 +32,7 @@ local service_status_table = {
type: "table", type: "table",
targets: [ targets: [
{ {
expr: 'sum(up{job="%s"}) by (instance)' % services[i], expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]],
intervalFactor: 1, intervalFactor: 1,
format: "table", format: "table",
refId: refId(i), refId: refId(i),
@ -81,10 +98,7 @@ local service_status_table = {
columns: [], columns: [],
scroll: true, scroll: true,
fontSize: "100%", fontSize: "100%",
sort: { sort: {col: 1, desc: false}, // sort by instance
col: 0,
desc: true,
},
links: [], links: [],
}; };

@ -1,5 +1,7 @@
local hosts = { local hosts = {
// name: "host:port" // name: ["host:port", role]
// See overview.jsonnet for role explanations.
mynode: ["localhost:8080", "replica"]
}; };
local services = [ local services = [
"restreamer", "restreamer",
@ -29,8 +31,11 @@ local services = [
metrics_path: "/metrics/%s" % service, metrics_path: "/metrics/%s" % service,
static_configs: [ static_configs: [
{ {
targets: [hosts[host]], targets: [hosts[host][0]],
labels: {instance: host}, labels: {
instance: host,
role: hosts[host][1],
},
} for host in std.objectFields(hosts) } for host in std.objectFields(hosts)
], ],
} }

Loading…
Cancel
Save