diff --git a/monitoring/dashboards/overview.jsonnet b/monitoring/dashboards/overview.jsonnet index 3c40029..50a098d 100644 --- a/monitoring/dashboards/overview.jsonnet +++ b/monitoring/dashboards/overview.jsonnet @@ -102,11 +102,22 @@ local service_status_table = { links: [], }; +local labels = { + labels: 'instance=~"$instance"' +}; + grafana.dashboard({ name: "Overview", uid: "rjd405mn", refresh: "30s", + templates: [ + { + name: "instance", + query: 'label_values(up, instance)' + }, + ], + rows: [ { @@ -124,8 +135,8 @@ grafana.dashboard({ display: "bars", expressions: { "{{instance}} {{service}} {{level}}({{module}}:{{function}})": ||| - sum(irate(log_count_total{level!="INFO"}[2m])) by (instance, service, level, module, function) > 0 - |||, + sum(irate(log_count_total{level!="INFO", %(labels)s}[2m])) by (instance, service, level, module, function) > 0 + ||| % labels, }, }, ], @@ -136,9 +147,9 @@ grafana.dashboard({ axis: {min: 0, label: "segments / sec"}, expressions: { "{{channel}}({{quality}}) live capture": - 'sum(rate(segments_downloaded_total[2m])) by (channel, quality)', + 'sum(rate(segments_downloaded_total{%(labels)s}[2m])) by (channel, quality)' % labels, "{{channel}}({{quality}}) backfilled": - 'sum(rate(segments_backfilled_total[2m])) by (channel, quality)', + 'sum(rate(segments_backfilled_total{%(labels)s}[2m])) by (channel, quality)' % labels, }, }, { @@ -146,7 +157,7 @@ grafana.dashboard({ axis: {min: 0, label: "requests / sec"}, expressions: { "{{method}} {{endpoint}}": - 'sum(rate(http_request_latency_all_count{status="200"}[2m])) by (endpoint, method)', + 'sum(rate(http_request_latency_all_count{status="200", %(labels)s}[2m])) by (endpoint, method)' % labels, }, }, { @@ -156,8 +167,8 @@ grafana.dashboard({ tooltip: "Does not include UNEDITED or DONE events", expressions: { "{{state}}": ||| - sum(event_counts{state!="UNEDITED", state!="DONE"}) by (state) - |||, + sum(event_counts{state!="UNEDITED", state!="DONE", %(labels)s}) by (state) + ||| % labels, }, }, ], @@ -169,16 +180,16 @@ grafana.dashboard({ expressions: { "{{instance}} {{service}}": ||| sum by (instance, service) ( - rate(process_cpu_seconds_total[2m]) + rate(process_cpu_seconds_total{%(labels)s}[2m]) ) - ||| + ||| % labels, }, }, { name: "Memory usage (RSS)", axis: {min: 0, format: grafana.formats.bytes}, expressions: { - "{{instance}} {{service}}": "process_resident_memory_bytes", + "{{instance}} {{service}}": "process_resident_memory_bytes{%(labels)s}" % labels, }, }, { @@ -186,7 +197,7 @@ grafana.dashboard({ axis: {min: 0, label: "restarts within last minute"}, tooltip: "Multiple restarts within 15sec will be missed, and only counted as one.", expressions: { - "{{instance}} {{service}}": "changes(process_start_time_seconds[1m])", + "{{instance}} {{service}}": "changes(process_start_time_seconds{%(labels)s}[1m])" % labels, }, }, ], @@ -201,7 +212,7 @@ grafana.dashboard({ axis: {min: 0, label: "segments / sec"}, expressions: { "{{instance}} {{channel}}({{quality}})": - 'sum(rate(segments_downloaded_total[2m])) by (instance, channel, quality)', + 'sum(rate(segments_downloaded_total{%(labels)s}[2m])) by (instance, channel, quality)' % labels, }, }, { @@ -213,9 +224,9 @@ grafana.dashboard({ // Ignore series where we're no longer fetching segments, // as they just show that it's been a long time since the last segment. ||| - time() - max(latest_segment) by (instance, channel, quality) - and sum(irate(segments_downloaded_total[2m])) by (instance, channel, quality) > 0 - |||, + time() - max(latest_segment{%(labels)s}) by (instance, channel, quality) + and sum(irate(segments_downloaded_total{%(labels)s}[2m])) by (instance, channel, quality) > 0 + ||| % labels, }, }, ], @@ -229,7 +240,7 @@ grafana.dashboard({ axis: {min: 0, label: "segments / sec"}, expressions: { "{{remote}} -> {{instance}}": - 'sum(rate(segments_backfilled_total[2m])) by (remote, instance)', + 'sum(rate(segments_backfilled_total{%(labels)s}[2m])) by (remote, instance)' % labels, }, }, ],