Merge pull request #149 from ekimekim/mike/monitoring/stuff

monitoring fixes and improvements
pull/150/head
Mike Lang 5 years ago committed by GitHub
commit 2accfa9080
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -12,8 +12,8 @@
"panels": [
{
"sort": {
"col": 0,
"desc": true
"col": 1,
"desc": false
},
"styles": [
{
@ -169,7 +169,7 @@
"0.5"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "cutter",
"alias": "segment_coverage",
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
@ -227,7 +227,7 @@
"0.5"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync",
"alias": "cutter",
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
@ -237,6 +237,35 @@
"pattern": "Value #F",
"type": "string",
"unit": "short"
},
{
"mappingType": 1,
"valueMaps": [
{
"text": "DOWN",
"value": "0"
},
{
"text": "UP",
"value": "1"
}
],
"colorMode": "cell",
"thresholds": [
"0.5",
"0.5"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"alias": "sheetsync",
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"decimals": 2,
"pattern": "Value #G",
"type": "string",
"unit": "short"
}
],
"span": 8,
@ -249,7 +278,7 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"restreamer\"}) by (instance)",
"expr": "sum(up{job=\"restreamer\", role=~\".*\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "A"
@ -257,7 +286,7 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"downloader\"}) by (instance)",
"expr": "sum(up{job=\"downloader\", role=~\".*\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "B"
@ -265,7 +294,7 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"backfiller\"}) by (instance)",
"expr": "sum(up{job=\"backfiller\", role=~\".*\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "C"
@ -273,7 +302,7 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"cutter\"}) by (instance)",
"expr": "sum(up{job=\"segment_coverage\", role=~\".*\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "D"
@ -281,7 +310,7 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"thrimshim\"}) by (instance)",
"expr": "sum(up{job=\"thrimshim\", role=~\"leader|edit|local_edit\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "E"
@ -289,10 +318,18 @@
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"sheetsync\"}) by (instance)",
"expr": "sum(up{job=\"cutter\", role=~\"leader|edit\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "F"
},
{
"instant": true,
"format": "table",
"expr": "sum(up{job=\"sheetsync\", role=~\"leader\"}) by (instance)",
"legendFormat": "",
"intervalFactor": 1,
"refId": "G"
}
],
"fontSize": "100%",

@ -1,11 +1,29 @@
local grafana = import "grafana.libsonnet";
// Map from service name to a regex of the roles whose nodes are matched for that service.
// The regex is substituted into the role=~"..." label matcher of the
// service status table's up{} query, so only nodes with a matching role are shown.
// ".*" matches every role.
// Role explanations:
// replica: Just downloads and replicates segments
// local_edit: Also runs a local thrimbletrimmer for doing local cuts
// edit: Also runs cutter for doing uploads
// leader: Also runs things that only run in one place, eg. sheetsync
local roles_for_service = {
"restreamer": ".*",
"downloader": ".*",
"backfiller": ".*",
"segment_coverage": ".*",
"thrimshim": "leader|edit|local_edit",
"cutter": "leader|edit",
"sheetsync": "leader",
};
// List of services, to impart ordering
local services = [
"restreamer",
"downloader",
"backfiller",
"cutter",
"segment_coverage",
"thrimshim",
"cutter",
"sheetsync",
];
@ -14,7 +32,7 @@ local service_status_table = {
type: "table",
targets: [
{
expr: 'sum(up{job="%s"}) by (instance)' % services[i],
expr: 'sum(up{job="%s", role=~"%s"}) by (instance)' % [services[i], roles_for_service[services[i]]],
intervalFactor: 1,
format: "table",
refId: refId(i),
@ -80,10 +98,7 @@ local service_status_table = {
columns: [],
scroll: true,
fontSize: "100%",
sort: {
col: 0,
desc: true,
},
sort: {col: 1, desc: false}, // sort by instance
links: [],
};

@ -3,7 +3,11 @@ services:
prometheus:
image: "wubloader_prometheus:latest"
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
restart: "on-failure"
volumes:
- "/storage/wubloader-metrics/prometheus:/prometheus"

@ -1,8 +1,8 @@
local hosts = [
"toodles.videostrike.team:1337",
"http://136.24.9.73:20088",
"wubloader.codegunner.com",
];
local hosts = {
// Nodes to scrape, keyed by node name.
// name: ["host:port", role]
// For every service, "host:port" is used as the scrape target, the name is
// attached as the "instance" label and the role as the "role" label.
// See overview.jsonnet for role explanations.
mynode: ["localhost:8080", "replica"]
};
local services = [
"restreamer",
"downloader",
@ -10,6 +10,7 @@ local services = [
"cutter",
"thrimshim",
"sheetsync",
"segment_coverage",
];
{
@ -18,14 +19,25 @@ local services = [
scrape_interval: "15s",
},
scrape_configs: [
{job_name: "prometheus", static_configs: [{targets: ["localhost:9090"]}]},
{
job_name: "prometheus",
static_configs: [
{targets: ["localhost:9090"], labels: {instance: "prometheus"}}
],
},
] + [
{
job_name: service,
metrics_path: "/metrics/%s" % service,
static_configs: [{
targets: hosts,
}],
static_configs: [
{
targets: [hosts[host][0]],
labels: {
instance: host,
role: hosts[host][1],
},
} for host in std.objectFields(hosts)
],
}
for service in services
],

Loading…
Cancel
Save