diff --git a/monitoring/dashboards/README b/monitoring/dashboards/README new file mode 100644 index 0000000..8aabe2e --- /dev/null +++ b/monitoring/dashboards/README @@ -0,0 +1,3 @@ +The files in this directory use a library for generating grafana dashboards +that at time of writing I'm not at liberty to share. Unfortunately, I don't have a better option. +So feel free to make changes, but I'll need to be the one to regenerate them. diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet new file mode 100644 index 0000000..e30b331 --- /dev/null +++ b/monitoring/dashboards/overview.jsonnet @@ -0,0 +1,148 @@ +local grafana = import "grafana.libsonnet"; + +local services = [ + "restreamer", + "downloader", + "backfiller", + "cutter", + "thrimshim", + "sheetsync", +]; + +local service_status_table = { + local refId(n) = std.char(std.codepoint('A') + n), + type: "table", + targets: [ + { + expr: 'sum(up{job="%s"}) by (instance)' % services[i], + intervalFactor: 1, + format: "table", + refId: refId(i), + legendFormat: "", + instant: true, + } + for i in std.range(0, std.length(services) - 1) + ], + styles: [ + // hidden cols + { + unit: "short", + type: "hidden", + alias: "", + decimals: 2, + colors: [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)", + ], + colorMode: null, + pattern: name, + dateFormat: "YYYY-MM-DD HH:mm:ss", + thresholds: [], + mappingType: 1, + } + for name in ["__name__", "job", "Time"] + ] + [ + // service cols + { + unit: "short", + type: "string", + alias: services[i], + decimals: 2, + colors: [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)", + ], + colorMode: "cell", + pattern: "Value #%s" % refId(i), + dateFormat: "YYYY-MM-DD HH:mm:ss", + thresholds: [ + "0.5", + "0.5", + ], + mappingType: 1, + valueMaps: [ + { + value: "0", + text: "DOWN", + }, + { + value: "1", + text: "UP", + }, + ], + } for i in std.range(0, std.length(services) - 1) + ], + transform: "table", + pageSize: null, + showHeader: true, + columns: [], + scroll: true, + fontSize: "100%", + sort: { + col: 0, + desc: true, + }, + links: [], +}; + +grafana.dashboard({ + name: "Overview", + uid: "rjd405mn", + + rows: [ + + { + panels: [ + // First row - immediate status heads-up + [ + { + name: "Service Status by Node", + span: 2 * grafana.span.third, + custom: service_status_table, + }, + { + name: "Error log rate", + axis: {min: 0, label: "logs / sec"}, + display: "bars", + expressions: { + "{{job}} {{level}}({{module}}:{{function}})": ||| + sum(irate(log_count_total{level!="INFO"}[2m])) by (job, level, module, function) > 0 + |||, + }, + }, + ], + // Second row - core "business" metrics + [ + { + name: "Segments downloaded", + axis: {min: 0, label: "segments / sec"}, + expressions: { + "{{stream}}({{variant}}) live capture": + 'sum(rate(segments_downloaded_total[2m])) by (stream, variant)', + "{{stream}}({{variant}}) backfilled": + 'sum(rate(segments_backfilled_total[2m])) by (stream, variant)', + }, + }, + { + name: "Successful requests by endpoint", + axis: {min: 0, label: "requests / sec"}, + expressions: { + "{{method}} {{endpoint}}": + 'sum(rate(http_request_latency_all_count{status="200"}[2m])) by (endpoint, method)', + }, + }, + { + name: "Database events by state", + axis: {min: 0, label: "events"}, + tooltip: "Not implemented", // TODO + expressions: {"Not implemented": "0"}, + }, + ], + ], + }, + + ], + +})