Merge pull request #149 from ekimekim/mike/monitoring/stuff

monitoring fixes and improvements
pull/150/head
Mike Lang 5 years ago committed by GitHub
commit 2accfa9080
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -12,8 +12,8 @@
"panels": [ "panels": [
{ {
"sort": { "sort": {
"col": 0, "col": 1,
"desc": true "desc": false
}, },
"styles": [ "styles": [
{ {
@ -169,7 +169,7 @@
"0.5" "0.5"
], ],
"dateFormat": "YYYY-MM-DD HH:mm:ss", "dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "cutter", "alias": "segment_coverage",
"colors": [ "colors": [
"rgba(245, 54, 54, 0.9)", "rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)", "rgba(237, 129, 40, 0.89)",
@ -227,7 +227,7 @@
"0.5" "0.5"
], ],
"dateFormat": "YYYY-MM-DD HH:mm:ss", "dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync", "alias": "cutter",
"colors": [ "colors": [
"rgba(245, 54, 54, 0.9)", "rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)", "rgba(237, 129, 40, 0.89)",
@ -237,6 +237,35 @@
"pattern": "Value #F", "pattern": "Value #F",
"type": "string", "type": "string",
"unit": "short" "unit": "short"
},
{
"mappingType": 1,
"valueMaps": [
{
"text": "DOWN",
"value": "0"
},
{
"text": "UP",
"value": "1"
}
],
"colorMode": "cell",
"thresholds": [
"0.5",
"0.5"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync",
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"decimals": 2,
"pattern": "Value #G",
"type": "string",
"unit": "short"
} }
], ],
"span": 8, "span": 8,
@ -249,7 +278,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"restreamer\"}) by (instance)", "expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "A" "refId": "A"
@ -257,7 +286,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"downloader\"}) by (instance)", "expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "B" "refId": "B"
@ -265,7 +294,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"backfiller\"}) by (instance)", "expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "C" "refId": "C"
@ -273,7 +302,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"cutter\"}) by (instance)", "expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "D" "refId": "D"
@ -281,7 +310,7 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"thrimshim\"}) by (instance)", "expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "E" "refId": "E"
@ -289,10 +318,18 @@
{ {
"instant": true, "instant": true,
"format": "table", "format": "table",
"expr": "sum(up{job=\"sheetsync\"}) by (instance)", "expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)",
"legendFormat": "", "legendFormat": "",
"intervalFactor": 1, "intervalFactor": 1,
"refId": "F" "refId": "F"
},
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "G"
} }
], ],
"fontSize": "100%", "fontSize": "100%",

@ -1,11 +1,29 @@
local grafana = import "grafana.libsonnet"; local grafana = import "grafana.libsonnet";
// Map from service to regex of matching roles.
// Role explanations:
// replica: Just downloads and replicates segments
// local_edit: Also runs a local thrimbletrimmer for doing local cuts
// edit: Also runs cutter for doing uploads
// leader: Also runs things that only run in one place, e.g. sheetsync
local roles_for_service = {
"restreamer": ".*",
"downloader": ".*",
"backfiller": ".*",
"segment_coverage": ".*",
"thrimshim": "leader|edit|local_edit",
"cutter": "leader|edit",
"sheetsync": "leader",
};
// List of services, to impart ordering
local services = [ local services = [
"restreamer", "restreamer",
"downloader", "downloader",
"backfiller", "backfiller",
"cutter", "segment_coverage",
"thrimshim", "thrimshim",
"cutter",
"sheetsync", "sheetsync",
]; ];
@ -14,7 +32,7 @@ local service_status_table = {
type: "table", type: "table",
targets: [ targets: [
{ {
expr: 'sum(up{job="%s"}) by (instance)' % services[i], expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]],
intervalFactor: 1, intervalFactor: 1,
format: "table", format: "table",
refId: refId(i), refId: refId(i),
@ -80,10 +98,7 @@ local service_status_table = {
columns: [], columns: [],
scroll: true, scroll: true,
fontSize: "100%", fontSize: "100%",
sort: { sort: {col: 1, desc: false}, // sort by instance
col: 0,
desc: true,
},
links: [], links: [],
}; };

@ -3,7 +3,11 @@ services:
prometheus: prometheus:
image: "wubloader_prometheus:latest" image: "wubloader_prometheus:latest"
command: command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=30d" - "--storage.tsdb.retention=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
restart: "on-failure" restart: "on-failure"
volumes: volumes:
- "/storage/wubloader-metrics/prometheus:/prometheus" - "/storage/wubloader-metrics/prometheus:/prometheus"

@ -1,8 +1,8 @@
local hosts = [ local hosts = {
"toodles.videostrike.team:1337", // name: ["host:port", role]
"http://136.24.9.73:20088", // See overview.jsonnet for role explanations.
"wubloader.codegunner.com", mynode: ["localhost:8080", "replica"]
]; };
local services = [ local services = [
"restreamer", "restreamer",
"downloader", "downloader",
@ -10,6 +10,7 @@ local services = [
"cutter", "cutter",
"thrimshim", "thrimshim",
"sheetsync", "sheetsync",
"segment_coverage",
]; ];
{ {
@ -18,14 +19,25 @@ local services = [
scrape_interval: "15s", scrape_interval: "15s",
}, },
scrape_configs: [ scrape_configs: [
{job_name: "prometheus", static_configs: [{targets: ["localhost:9090"]}]}, {
job_name: "prometheus",
static_configs: [
{targets: ["localhost:9090"], labels: {instance: "prometheus"}}
],
},
] + [ ] + [
{ {
job_name: service, job_name: service,
metrics_path: "/metrics/%s" % service, metrics_path: "/metrics/%s" % service,
static_configs: [{ static_configs: [
targets: hosts, {
}], targets: [hosts[host][0]],
labels: {
instance: host,
role: hosts[host][1],
},
} for host in std.objectFields(hosts)
],
} }
for service in services for service in services
], ],

Loading…
Cancel
Save