chat_archiver: Add prometheus metrics

pull/300/head
Mike Lang 2 years ago committed by Mike Lang
parent c25d79e7c2
commit c1c1c11bce

@@ -9,6 +9,7 @@ import signal
import socket
import time
from calendar import timegm
from collections import defaultdict
from datetime import datetime
from itertools import count
from uuid import uuid4
@@ -17,9 +18,11 @@ import gevent.event
import gevent.queue
from common import ensure_directory
from common.stats import timed
from girc import Client
from monotonic import monotonic
import prometheus_client as prom
# How long each batch is
BATCH_INTERVAL = 60
@@ -49,6 +52,62 @@ MAX_SERVER_LAG = 30
# by up to this amount before and after our best guess
ESTIMATED_TIME_PADDING = 5
messages_received = prom.Counter(
	"messages_received",
	"Number of chat messages received by the client. 'client' tag is per client instance.",
	["channel", "client", "command"],
)
messages_ignored = prom.Counter(
	"messages_ignored",
	"Number of chat messages that were received but ignored due to not being on the allowlist.",
	["channel", "client", "command"],
)
messages_written = prom.Counter(
	"messages_written",
	"Number of chat messages received and then written out to disk in a batch.",
	["channel", "client", "command"],
)
batch_messages = prom.Histogram(
	"batch_messages",
	"Number of messages in batches written to disk",
	["channel", "client"],
	buckets=[0, 1, 4, 16, 64, 256, 1024],
)
# based on DB2021, an average PRIVMSG is about 600 bytes.
# so since batch_messages goes up to 1024, batch_bytes should go up to ~ 600KB.
# let's just call it 1MB.
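# (Checking the arithmetic: 600 bytes/message * 1024 messages = 614400 bytes ~= 600 KiB,
# so a 1 MiB top bucket leaves headroom; boundaries below quadruple from 256 B to 1 MiB.)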
batch_bytes = prom.Histogram(
	"batch_bytes",
	"Size in bytes of batches written to disk",
	["channel", "client"],
	buckets=[0, 256, 1024, 4096, 16384, 65536, 262144, 1048576],
)
open_batches = prom.Gauge(
	"open_batches",
	"Number of batches that have at least one pending message not yet written to disk",
	["channel", "client"],
)
server_lag = prom.Gauge(
	"server_lag",
	"Estimated time difference between server-side timestamps and local time, based on latest message",
	["channel", "client"],
)
merge_pass_duration = prom.Histogram(
	"merge_pass_duration",
	"How long it took to run through all batches and merge any duplicates",
)
merge_pass_merges = prom.Histogram(
	"merge_pass_merges",
	"How many merges (times for which more than one batch existed) were done in a single merge pass",
	buckets=[0, 1, 10, 100, 1000, 10000],
)
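
# Not shown in this diff: something needs to serve these metrics for Prometheus to scrape.
# A minimal sketch, assuming no exporter is started elsewhere in the service (the port is
# hypothetical):
#
#	prom.start_http_server(8008)  # serves /metrics from a background thread
#
# Counters are then exposed with a _total suffix, e.g.
# messages_received_total{channel="...",client="...",command="PRIVMSG"}.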
class Archiver(object):
	def __init__(self, name, base_dir, channel, nick, oauth_token):
@@ -70,6 +129,21 @@ class Archiver(object):
		)
		self.client.channel('#{}'.format(channel)).join()
	def write_batch(self, batch_time, messages):
		# wrapper around the general write_batch() function that also records metrics
		write_batch(
			self.path, batch_time, messages,
			size_histogram=batch_bytes.labels(channel=self.channel, client=id(self)),
		)
		batch_messages.labels(channel=self.channel, client=id(self)).observe(len(messages))
		# incrementing a prom counter can be stupidly expensive, so collect up per-command
		# values and do them in one go
		by_command = defaultdict(lambda: 0)
		for message in messages:
			by_command[message["command"]] += 1
		for command, count in by_command.items():
			messages_written.labels(channel=self.channel, client=id(self), command=command).inc(count)
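		# (With N buffered messages spanning M distinct commands, this does M labels()
		# lookups and locked increments instead of N.)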
	def run(self):
		# wait for twitch to send the initial ROOMSTATE for the room we've joined.
		# everything preceding this message is per-connection stuff we don't care about.
@@ -95,6 +169,7 @@ class Archiver(object):
		last_timestamped_message = None
		# {batch time: [messages]}
		batches = {}
		open_batches.labels(channel=self.channel, client=id(self)).set_function(lambda: len(batches))
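		# (set_function makes the gauge lazy: prometheus_client calls the lambda at scrape
		# time, so this always reports the current len(batches) with no explicit set() calls.)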
		while not (self.stopping.is_set() and self.messages.empty()):
			# wait until we either have a message, are stopping, or a batch can be closed
@@ -112,7 +187,7 @@ class Archiver(object):
			for batch_time, messages in list(batches.items()):
				if now >= batch_time + BATCH_INTERVAL + MAX_SERVER_LAG:
					del batches[batch_time]
					self.write_batch(batch_time, messages)
			# consume a message if any
			try:
@@ -122,6 +197,7 @@ class Archiver(object):
			if message.command not in COMMANDS:
				self.logger.info("Skipping non-whitelisted command: {}".format(message.command))
				messages_ignored.labels(channel=self.channel, client=id(self), command=message.command).inc()
				continue
			self.logger.debug("Got message: {}".format(message))
@@ -131,12 +207,14 @@ class Archiver(object):
			}
			data['receivers'] = {self.name: message.received_at}
			self.logger.debug("Got message: {}".format(data))
			messages_received.labels(channel=self.channel, client=id(self), command=message.command).inc()
			if data['tags'] and 'tmi-sent-ts' in data['tags']:
				# explicit server time is available
				timestamp = int(data['tags']['tmi-sent-ts']) / 1000. # original is int ms
				last_timestamped_message = message
				last_server_time = timestamp
				server_lag.labels(channel=self.channel, client=id(self)).set(time.time() - timestamp)
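				# (Positive lag means the server timestamp trails our wall clock, i.e.
				# delivery delay plus any clock skew between here and Twitch.)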
				time_range = 0
				self.logger.debug("Message has exact timestamp: {}".format(timestamp))
				# check for any non-timestamped messages which we now know must have been
@@ -178,7 +256,7 @@ class Archiver(object):
		# Close any remaining batches
		for batch_time, messages in batches.items():
			self.write_batch(batch_time, messages)
		self.client.wait_for_stop() # re-raise any errors
		self.logger.info("Client stopped")
@@ -187,9 +265,11 @@ class Archiver(object):
		self.client.stop()

def write_batch(path, batch_time, messages, size_histogram=None):
	"""Batches are named PATH/YYYY-MM-DDTHH/MM:SS-HASH.json"""
	output = (format_batch(messages) + '\n').encode('utf-8')
	if size_histogram is not None:
		size_histogram.observe(len(output))
	hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=")
	hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")
	time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")
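	# Illustrative example (values hypothetical): a batch at 2021-11-13 21:35:10 UTC becomes
	# PATH/2021-11-13T21/35:10-HASH.json, where HASH is urlsafe base64 of the SHA-256 above.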
@@ -251,6 +331,7 @@ def merge_all(path, interval=None, stopping=None):
		stopping = gevent.event.Event()
	while not stopping.is_set():
		start = monotonic()
		merges_done = 0
		# loop until no changes
		while True:
			logging.debug("Scanning for merges")
@@ -270,11 +351,14 @@ def merge_all(path, interval=None, stopping=None):
					logging.info("Merging {} batches at time {}".format(count, timestamp))
					batch_time = timegm(time.strptime(timestamp, "%Y-%m-%dT%H:%M:%S"))
					merge_batch_files(path, batch_time)
					merges_done += 1
		duration = monotonic() - start
		merge_pass_duration.observe(duration)
		merge_pass_merges.observe(merges_done)
		if interval is None:
			# one-shot
			break
		remaining = interval - duration
		if remaining > 0:
			logging.debug("Waiting {}s for next merge scan".format(remaining))
			stopping.wait(remaining)
@@ -325,6 +409,7 @@ def merge_batch_files(path, batch_time):
		if batch_file not in written:
			os.remove(batch_file)
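
# common.stats.timed is not shown in this diff; it appears to record each call's duration,
# with the normalize= hook presumably dividing that duration by len(left) + len(right) to
# give a per-message cost. The exact semantics here are an assumption.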
@timed("merge_messages", normalize=lambda _, left, right: len(left) + len(right))
def merge_messages(left, right):
	"""Merges two lists of messages into one merged list.
	This operation should be a CRDT, i.e. all the following hold:
@@ -358,7 +443,7 @@ def merge_messages(left, right):
		if k not in ("receivers", "time", "time_range")
	):
	receivers = a["receivers"] | b["receivers"]
	# Error checking - make sure no receiver timestamps are being overwritten.
	# This would indicate we're merging two messages received at different times
	# by the same recipient.
	for k in receivers.keys():
