@@ -9,6 +9,7 @@ import signal
 import socket
 import time
 from calendar import timegm
+from collections import defaultdict
 from datetime import datetime
 from itertools import count
 from uuid import uuid4
@@ -17,9 +18,11 @@ import gevent.event
 import gevent.queue
 
 from common import ensure_directory
+from common.stats import timed
 
 from girc import Client
 from monotonic import monotonic
+import prometheus_client as prom
 
 # How long each batch is
 BATCH_INTERVAL = 60
@@ -49,6 +52,62 @@ MAX_SERVER_LAG = 30
 # by up to this amount before and after our best guess
 ESTIMATED_TIME_PADDING = 5
 
+messages_received = prom.Counter(
+	"messages_received",
+	"Number of chat messages received by the client. 'client' tag is per client instance.",
+	["channel", "client", "command"],
+)
+
+messages_ignored = prom.Counter(
+	"messages_ignored",
+	"Number of chat messages that were received but ignored due to not being on the allowlist.",
+	["channel", "client", "command"],
+)
+
+messages_written = prom.Counter(
+	"messages_written",
+	"Number of chat messages received and then written out to disk in a batch.",
+	["channel", "client", "command"],
+)
+
+batch_messages = prom.Histogram(
+	"batch_messages",
+	"Number of messages in batches written to disk",
+	["channel", "client"],
+	buckets=[0, 1, 4, 16, 64, 256, 1024],
+)
+
+# based on DB2021, an average PRIVMSG is about 600 bytes.
+# so since batch_messages goes up to 1024, batch_bytes should go up to ~ 600KB.
+# let's just call it 1MB.
+batch_bytes = prom.Histogram(
+	"batch_bytes",
+	"Size in bytes of batches written to disk",
+	["channel", "client"],
+	buckets=[0, 256, 1024, 4096, 16384, 65536, 262144, 1048576],
+)
+
+open_batches = prom.Gauge(
+	"open_batches",
+	"Number of batches that have at least one pending message not yet written to disk",
+	["channel", "client"],
+)
+
+server_lag = prom.Gauge(
+	"server_lag",
+	"Estimated time difference between server-side timestamps and local time, based on latest message",
+	["channel", "client"],
+)
+
+merge_pass_duration = prom.Histogram(
+	"merge_pass_duration",
+	"How long it took to run through all batches and merge any duplicates",
+)
+merge_pass_merges = prom.Histogram(
+	"merge_pass_merges",
+	"How many merges (times for which more than one batch existed) were done in a single merge pass",
+	buckets=[0, 1, 10, 100, 1000, 10000],
+)
+
 class Archiver(object):
 	def __init__(self, name, base_dir, channel, nick, oauth_token):
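Review note, not part of the patch: each distinct label combination on these collectors becomes its own time series, so the per-instance `client=id(self)` label stays cheap only while few Archiver instances are alive. A minimal sketch of how the collectors are consumed, using the standard prometheus_client API (port and label values are illustrative):

```python
import prometheus_client as prom

# Serve every registered collector on http://localhost:8001/metrics
# (port chosen arbitrarily for the example).
prom.start_http_server(8001)

# Each labels() call materializes one child time series.
messages_received.labels(channel="somechannel", client="140234", command="PRIVMSG").inc()
```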
@@ -70,6 +129,21 @@ class Archiver(object):
 		)
 		self.client.channel('#{}'.format(channel)).join()
 
+	def write_batch(self, batch_time, messages):
+		# wrapper around general write_batch() function
+		write_batch(
+			self.path, batch_time, messages,
+			size_histogram=batch_bytes.labels(channel=self.channel, client=id(self)),
+		)
+		batch_messages.labels(channel=self.channel, client=id(self)).observe(len(messages))
+		# incrementing a prom counter can be stupidly expensive, collect up per-command values
+		# so we can do them in one go
+		by_command = defaultdict(lambda: 0)
+		for message in messages:
+			by_command[message["command"]] += 1
+		for command, count in by_command.items():
+			messages_written.labels(channel=self.channel, client=id(self), command=command).inc(count)
+
 	def run(self):
 		# wait for twitch to send the initial ROOMSTATE for the room we've joined.
 		# everything preceeding this message is per-connection stuff we don't care about.
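Side note on the aggregation trick above (not in the patch): collections.Counter expresses the same per-command tally. A standalone sketch with stand-in message dicts:

```python
from collections import Counter

messages = [{"command": "PRIVMSG"}, {"command": "PRIVMSG"}, {"command": "CLEARCHAT"}]

# One inc(count) per distinct command instead of one inc() per message.
for command, count in Counter(m["command"] for m in messages).items():
	print(command, count)  # PRIVMSG 2, then CLEARCHAT 1
```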
@@ -95,6 +169,7 @@ class Archiver(object):
 		last_timestamped_message = None
 		# {batch time: [messages]}
 		batches = {}
+		open_batches.labels(channel=self.channel, client=id(self)).set_function(lambda: len(batches))
 
 		while not (self.stopping.is_set() and self.messages.empty()):
 			# wait until we either have a message, are stopping, or a batch can be closed
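For reference (not in the patch): Gauge.set_function defers to the callback at scrape time, so open_batches tracks len(batches) with no bookkeeping on the hot path. A self-contained sketch:

```python
import prometheus_client as prom

batches = {}
pending = prom.Gauge("pending_batches", "Example gauge tracking a dict")
pending.set_function(lambda: len(batches))

batches[1234] = ["msg"]
# The lambda is evaluated at collection time:
print(prom.generate_latest().decode())  # includes: pending_batches 1.0
```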
@@ -112,7 +187,7 @@ class Archiver(object):
 			for batch_time, messages in list(batches.items()):
 				if now >= batch_time + BATCH_INTERVAL + MAX_SERVER_LAG:
 					del batches[batch_time]
-					write_batch(self.path, batch_time, messages)
+					self.write_batch(batch_time, messages)
 
 			# consume a message if any
 			try:
@@ -122,6 +197,7 @@ class Archiver(object):
 
 			if message.command not in COMMANDS:
 				self.logger.info("Skipping non-whitelisted command: {}".format(message.command))
+				messages_ignored.labels(channel=self.channel, client=id(self), command=message.command).inc()
 				continue
 
 			self.logger.debug("Got message: {}".format(message))
@@ -131,12 +207,14 @@ class Archiver(object):
 			}
 			data['receivers'] = {self.name: message.received_at}
 			self.logger.debug("Got message: {}".format(data))
+			messages_received.labels(channel=self.channel, client=id(self), command=message.command).inc()
 
 			if data['tags'] and 'tmi-sent-ts' in data['tags']:
 				# explicit server time is available
 				timestamp = int(data['tags']['tmi-sent-ts']) / 1000. # original is int ms
 				last_timestamped_message = message
 				last_server_time = timestamp
+				server_lag.labels(channel=self.channel, client=id(self)).set(time.time() - timestamp)
 				time_range = 0
 				self.logger.debug("Message has exact timestamp: {}".format(timestamp))
 				# check for any non-timestamped messages which we now know must have been
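To make the server_lag units concrete (illustrative numbers, not from the patch): tmi-sent-ts is integer milliseconds, so after dividing by 1000 the gauge is in seconds:

```python
tags = {"tmi-sent-ts": "1600000002500"}  # ms since epoch (example value)
timestamp = int(tags["tmi-sent-ts"]) / 1000.  # 1600000002.5 seconds

local_now = 1600000003.2  # stand-in for time.time() at receipt
print(round(local_now - timestamp, 3))  # 0.7 -> recorded lag, in seconds
```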
@@ -178,7 +256,7 @@ class Archiver(object):
 
 		# Close any remaining batches
 		for batch_time, messages in batches.items():
-			write_batch(self.path, batch_time, messages)
+			self.write_batch(batch_time, messages)
 
 		self.client.wait_for_stop() # re-raise any errors
 		self.logger.info("Client stopped")
@@ -187,9 +265,11 @@ class Archiver(object):
 		self.client.stop()
 
 
-def write_batch(path, batch_time, messages):
+def write_batch(path, batch_time, messages, size_histogram=None):
 	"""Batches are named PATH/YYYY-MM-DDTHH/MM:SS-HASH.json"""
 	output = (format_batch(messages) + '\n').encode('utf-8')
+	if size_histogram is not None:
+		size_histogram.observe(len(output))
 	hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=")
 	hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")
 	time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")
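Worked example of the naming scheme (values invented for illustration): the hash is URL-safe (altchars b"-_"), unpadded base64 of the batch's SHA-256, filed under hour and minute:second components:

```python
import base64, hashlib
from datetime import datetime

batch_time = 1600000000  # example unix time
output = b'{"example": "batch"}\n'  # stand-in for format_batch() output

hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=")
hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")  # 2020-09-13T12
mmss = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")        # 26:40
print("{}/{}-{}.json".format(hour, mmss, hash))  # PATH prefix omitted
```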
@@ -251,6 +331,7 @@ def merge_all(path, interval=None, stopping=None):
 		stopping = gevent.event.Event()
 	while not stopping.is_set():
 		start = monotonic()
+		merges_done = 0
 		# loop until no changes
 		while True:
 			logging.debug("Scanning for merges")
@@ -270,11 +351,14 @@ def merge_all(path, interval=None, stopping=None):
 				logging.info("Merging {} batches at time {}".format(count, timestamp))
 				batch_time = timegm(time.strptime(timestamp, "%Y-%m-%dT%H:%M:%S"))
 				merge_batch_files(path, batch_time)
+				merges_done += 1
+		duration = monotonic() - start
+		merge_pass_duration.observe(duration)
+		merge_pass_merges.observe(merges_done)
 		if interval is None:
 			# one-shot
 			break
-		next_run = start + interval
-		remaining = next_run - monotonic()
+		remaining = interval - duration
 		if remaining > 0:
 			logging.debug("Waiting {}s for next merge scan".format(remaining))
 			stopping.wait(remaining)
@@ -325,6 +409,7 @@ def merge_batch_files(path, batch_time):
 		if batch_file not in written:
 			os.remove(batch_file)
 
+@timed("merge_messages", normalize=lambda _, left, right: len(left) + len(right))
 def merge_messages(left, right):
 	"""Merges two lists of messages into one merged list.
 	This operation should be a CRDT, ie. all the following hold:
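common.stats.timed is project-internal, so the following is only a plausible reading of the call shape above: normalize appears to receive the function's return value plus its arguments and return a unit count, letting the timing be reported per merged message rather than per call. A hypothetical stand-in (names and behavior assumed, not taken from common.stats):

```python
import functools
import time

def timed(name, normalize=None):
	# Hypothetical sketch only: the real common.stats.timed may differ.
	def decorator(fn):
		@functools.wraps(fn)
		def wrapper(*args, **kwargs):
			start = time.monotonic()
			ret = fn(*args, **kwargs)
			elapsed = time.monotonic() - start
			# normalize(return_value, *args): e.g. len(left) + len(right) above.
			units = normalize(ret, *args, **kwargs) if normalize else 1
			print("{}: {:.6f}s/unit over {} units".format(name, elapsed / max(units, 1), units))
			return ret
		return wrapper
	return decorator
```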
@@ -358,7 +443,7 @@ def merge_messages(left, right):
 			if k not in ("receivers", "time", "time_range")
 		):
 			receivers = a["receivers"] | b["receivers"]
-			# Error checkdng - make sure no receiver timestamps are being overwritten.
+			# Error checking - make sure no receiver timestamps are being overwritten.
 			# This would indicate we're merging two messages recieved at different times
 			# by the same recipient.
 			for k in receivers.keys():
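The docstring's CRDT claim (an idempotent, commutative, associative merge) is easy to spot-check. A test sketch, not part of the patch, assuming merge_messages does not mutate its inputs and that messages are JSON-serializable dicts:

```python
import json

def canon(batch):
	# Order-insensitive canonical form for comparing message lists.
	return sorted(json.dumps(m, sort_keys=True) for m in batch)

def check_crdt_laws(merge, a, b, c):
	assert canon(merge(a, a)) == canon(a)                                # idempotent
	assert canon(merge(a, b)) == canon(merge(b, a))                      # commutative
	assert canon(merge(merge(a, b), c)) == canon(merge(a, merge(b, c)))  # associative
```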