You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
wubloader/monitoring/dashboards/overview.jsonnet

241 lines
6.6 KiB
Plaintext

local grafana = import "grafana.libsonnet";
// For each service, a regex selecting the node roles on which it runs.
// The regex is used as a role=~"..." matcher in the service status queries.
// What each role means:
//   replica:    Just downloads and replicates segments
//   local_edit: Also runs a local thrimbletrimmer for doing local cuts
//   edit:       Also runs cutter for doing uploads
//   leader:     Also runs things that only run in one place, eg. sheetsync
local roles_for_service = {
  // Regex that matches any role at all.
  local any_role = ".*",

  // Services matched on nodes of every role.
  restreamer: any_role,
  downloader: any_role,
  backfiller: any_role,
  segment_coverage: any_role,

  // Services matched only on nodes with the listed roles.
  thrimshim: "leader|edit|local_edit",
  cutter: "leader|edit",
  sheetsync: "leader",
};
// Service names in a fixed order. Code that iterates services walks this list
// by index, so the order written here is the order the services appear in.
// Every name listed here is also a key of roles_for_service.
local services = [
  "restreamer", "downloader", "backfiller", "segment_coverage",
  "thrimshim", "cutter", "sheetsync",
];
// Table panel definition with one query and one UP/DOWN column per entry in
// `services`. Each query sums the `up` metric by instance, restricted to the
// roles that roles_for_service lists for that service.
local service_status_table = {
  // Turns a 0-based query index into a refId letter: 0 -> "A", 1 -> "B", ...
  local refId(n) = std.char(std.codepoint('A') + n),

  // Colour list attached to every column style.
  local palette = [
    "rgba(245, 54, 54, 0.9)",
    "rgba(237, 129, 40, 0.89)",
    "rgba(50, 172, 45, 0.97)",
  ],

  // Settings that hidden and visible column styles have in common.
  local base_style = {
    unit: "short",
    decimals: 2,
    colors: palette,
    dateFormat: "YYYY-MM-DD HH:mm:ss",
    mappingType: 1,
  },

  // Instant, table-format query for the service at position i.
  local query_for(i, service) = {
    refId: refId(i),
    expr: 'sum(up{service="%s", role=~"%s"}) by (instance)' % [service, roles_for_service[service]],
    format: "table",
    instant: true,
    intervalFactor: 1,
    legendFormat: "",
  },

  // Style that hides the column whose name matches `column`.
  local hidden_style(column) = base_style + {
    type: "hidden",
    alias: "",
    pattern: column,
    colorMode: null,
    thresholds: [],
  },

  // Style for the value column of query i: titled with the service name,
  // coloured per cell, and with 0 / 1 displayed as DOWN / UP.
  local status_style(i, service) = base_style + {
    type: "string",
    alias: service,
    // "Value #<refId>" pairs this style with the query built by query_for(i, ...).
    pattern: "Value #%s" % refId(i),
    colorMode: "cell",
    thresholds: ["0.5", "0.5"],
    valueMaps: [
      {value: "0", text: "DOWN"},
      {value: "1", text: "UP"},
    ],
  },

  type: "table",
  targets: std.mapWithIndex(query_for, services),
  styles:
    // hidden cols
    [hidden_style(column) for column in ["__name__", "service", "Time"]]
    // service cols
    + std.mapWithIndex(status_style, services),
  transform: "table",
  pageSize: null,
  showHeader: true,
  columns: [],
  scroll: true,
  fontSize: "100%",
  sort: {col: 1, desc: false},  // sort by instance
  links: [],
};
// Builds the "summed 2-minute rate" query used by the segment panels.
// `counter` is the metric name; `labels` is the comma-separated grouping list.
local segment_rate(counter, labels) = 'sum(rate(%s[2m])) by (%s)' % [counter, labels];

// First row - immediate status heads-up
local status_panels = [
  {
    name: "Service Status by Node",
    span: 2 * grafana.span.third,
    custom: service_status_table,
  },
  {
    name: "Error log rate",
    axis: {min: 0, label: "logs / sec"},
    display: "bars",
    expressions: {
      "{{instance}} {{service}} {{level}}({{module}}:{{function}})": |||
        sum(irate(log_count_total{level!="INFO"}[2m])) by (instance, service, level, module, function) > 0
      |||,
    },
  },
];

// Second row - core "business" metrics
local business_panels = [
  {
    name: "Segments downloaded",
    axis: {min: 0, label: "segments / sec"},
    expressions: {
      "{{channel}}({{quality}}) live capture":
        segment_rate("segments_downloaded_total", "channel, quality"),
      "{{channel}}({{quality}}) backfilled":
        segment_rate("segments_backfilled_total", "channel, quality"),
    },
  },
  {
    name: "Successful requests by endpoint",
    axis: {min: 0, label: "requests / sec"},
    expressions: {
      "{{method}} {{endpoint}}":
        'sum(rate(http_request_latency_all_count{status="200"}[2m])) by (endpoint, method)',
    },
  },
  {
    name: "Database events by state",
    axis: {min: 0, label: "events"},
    stack: true,
    tooltip: "Does not include UNEDITED or DONE events",
    expressions: {
      "{{state}}": |||
        sum(event_counts{state!="UNEDITED", state!="DONE"}) by (state)
      |||,
    },
  },
];

// Third row - process-level health
local process_panels = [
  {
    name: "CPU usage",
    axis: {min: 0, label: "cores", format: grafana.formats.percent},
    expressions: {
      "{{instance}} {{service}}": |||
        sum by (instance, service) (
        rate(process_cpu_seconds_total[2m])
        )
      |||,
    },
  },
  {
    name: "Memory usage (RSS)",
    axis: {min: 0, format: grafana.formats.bytes},
    expressions: {
      "{{instance}} {{service}}": "process_resident_memory_bytes",
    },
  },
  {
    name: "Process restarts",
    axis: {min: 0, label: "restarts within last minute"},
    tooltip: "Multiple restarts within 15sec will be missed, and only counted as one.",
    expressions: {
      "{{instance}} {{service}}": "changes(process_start_time_seconds[1m])",
    },
  },
];

// Row of downloader panels, broken down per node.
local downloader_row = {
  name: "Downloader",
  panels: [
    {
      name: "Segments downloaded by node",
      axis: {min: 0, label: "segments / sec"},
      expressions: {
        "{{instance}} {{channel}}({{quality}})":
          segment_rate("segments_downloaded_total", "instance, channel, quality"),
      },
    },
    {
      name: "Downloader stream delay by node",
      tooltip: "Time between the latest downloaded segment's timestamp and current time",
      axis: {min: 0, format: grafana.formats.time},
      expressions: {
        "{{instance}} {{channel}}({{quality}})":
          // Ignore series where we're no longer fetching segments,
          // as they just show that it's been a long time since the last segment.
          |||
            time() - max(latest_segment) by (instance, channel, quality)
            and sum(irate(segments_downloaded_total[2m])) by (instance, channel, quality) > 0
          |||,
      },
    },
  ],
};

// Row showing backfill traffic between pairs of nodes.
local backfiller_row = {
  name: "Backfiller",
  panels: [
    {
      name: "Backfill by node pair",
      axis: {min: 0, label: "segments / sec"},
      expressions: {
        "{{remote}} -> {{instance}}":
          segment_rate("segments_backfilled_total", "remote, instance"),
      },
    },
  ],
};

// The "Overview" dashboard, refreshed every 30s.
grafana.dashboard({
  name: "Overview",
  uid: "rjd405mn",
  refresh: "30s",
  rows: [
    // Unnamed leading row. Its `panels` is a list of three panel lists; how
    // nested lists are laid out is decided by grafana.libsonnet.
    {panels: [status_panels, business_panels, process_panels]},
    downloader_row,
    backfiller_row,
  ],
})