diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4708421 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "chat_archiver/girc"] + path = chat_archiver/girc + url = https://github.com/ekimekim/girc diff --git a/chat_archiver/Dockerfile b/chat_archiver/Dockerfile new file mode 100644 index 0000000..23546a5 --- /dev/null +++ b/chat_archiver/Dockerfile @@ -0,0 +1,6 @@ +FROM quay.io/ekimekim/wubloader-downloader:32138bb +COPY girc /tmp/girc +RUN pip install /tmp/girc && rm -r /tmp/girc +COPY . /tmp/archiver +RUN pip install /tmp/archiver && rm -r /tmp/archiver +ENTRYPOINT ["python", "-m", "chat_archiver", "--base-dir", "/mnt"] diff --git a/chat_archiver/README b/chat_archiver/README new file mode 100644 index 0000000..6101387 --- /dev/null +++ b/chat_archiver/README @@ -0,0 +1,53 @@ +Chat archiver records messages from TMI (the twitch messaging system) +in a way that preserves as much context as possible, and allows multiple independently-recorded +streams to be combined to ensure nothing was missed. + +We store messages in newline-delimited JSON files, in timestamp order. +Each file covers one minute of messages. +These files are named by their timestamp + hash and merged with other files via a CRDT model. + CRDT means you have a merge operation (.) such that + (A.B).C == A.(B.C) (associative) + A.B == B.A (commutitive) + A.A == A (reflexive) + +We have a few different kinds of messages to deal with: + Messages with a unique id and timestamp + eg. PRIVMSG, USERNOTICE. + These are easy, as they have a canonical timestamp and can be trivially deduplicated by id. + Messages with an implied ordering + eg. ROOMSTATE, NOTICE, CLEARCHAT. + These messages arrive in what we assume is a consistent order on all clients, + but have no direct associated timestamp. We thus set a timestamp *range* for when + the message could have occurred from the server's perspective, between the last known + server timestamp (since it must be after that message) and the next received server timestamp + (since it must be before that message). We can set some reasonable timeout here in case + we don't receive another message within a short time window. + Messages with a timestamp range are ordered by their timestamp start. + This also governs which file they are in if their range overlaps two files. + Messages known to be out of order + This is specific to JOINs and PARTs. + Twitch documents that these may be delayed by up to 10sec. + So we follow the rules as per messages with implied ordering, + except we pad the possible start time by 10 seconds. + +How to merge two files + In general, if the same message (all non-receiver fields identical) is present in both files, + it is included once in the output. For messages with unique ids, this is all that's needed. + For messages without unique ids, we face the question of "is this the same message". + All the following must be true: + * All non-timestamp, non-receiver fields match + * Timestamp ranges overlap + If a message may match multiple messages on the other side with these rules, then + we pick one arbitrarily. + We then merge these messages, setting the timestamp range to the intersection of the inputs. + Literal edge cases: Timestamp ranges that span two files + It may be the case that we can match a message whose timestamp range overhangs file A + with a message near the start of file B. So whenever we are merging files, we need to + also consider the file before and the file after on our side. + In all cases when we merge two messages, we should merge the receiver timestamp field which maps + the receiver id to the timestamp it received the message. This preserves message provedence. + +All files are stored in newline-delimited, canonicalised JSON so we can use hashes to compare them. +It should always be true that merging B into A and merging A into B should produce identical files +with the same hash (effects of neighboring files notwithstanding - that just means more merges will +be needed in order to stabilize). diff --git a/chat_archiver/chat_archiver/__init__.py b/chat_archiver/chat_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chat_archiver/chat_archiver/__main__.py b/chat_archiver/chat_archiver/__main__.py new file mode 100644 index 0000000..f60c3fc --- /dev/null +++ b/chat_archiver/chat_archiver/__main__.py @@ -0,0 +1,15 @@ +import gevent.monkey +gevent.monkey.patch_all() + +import logging +import os + +import argh + +from .main import main + +LOG_FORMAT = "[%(asctime)s] %(levelname)8s %(name)s(%(module)s:%(lineno)d): %(message)s" + +level = os.environ.get('WUBLOADER_LOG_LEVEL', 'INFO').upper() +logging.basicConfig(level=level, format=LOG_FORMAT) +argh.dispatch_command(main) diff --git a/chat_archiver/chat_archiver/main.py b/chat_archiver/chat_archiver/main.py new file mode 100644 index 0000000..db9c3a5 --- /dev/null +++ b/chat_archiver/chat_archiver/main.py @@ -0,0 +1,209 @@ + +import base64 +import hashlib +import json +import logging +import os +import signal +import socket +import time +from datetime import datetime +from uuid import uuid4 + +import gevent.event +import gevent.queue + +from common import ensure_directory + +from girc import Client + + +class Archiver(object): + # These are known to arrive up to 10s after their actual time + DELAYED_COMMANDS = [ + "JOIN", + "PART", + ] + + COMMANDS = DELAYED_COMMANDS + [ + "PRIVMSG", + "CLEARCHAT", + "CLEARMSG", + "HOSTTARGET", + "NOTICE", + "ROOMSTATE", + "USERNOTICE", + "USERSTATE", + ] + + # How long each batch is + BATCH_INTERVAL = 60 + + # Assume we're never more than this amount of time behind the server time + # Worst case if too low: multiple output files for same batch that need merging later + MAX_SERVER_LAG = 30 + + # When guessing when a non-timestamped event occurred, pad the possible range + # by up to this amount before and after our best guess + ESTIMATED_TIME_PADDING = 5 + + def __init__(self, name, base_dir, channel, nick, oauth_token): + self.logger = logging.getLogger(type(self).__name__).getChild(channel) + self.name = name + self.messages = gevent.queue.Queue() + self.path = os.path.join(base_dir, channel) + + self.stopping = gevent.event.Event() + self.client = Client( + hostname='irc.chat.twitch.tv', + port=6697, + ssl=True, + nick=nick, + password=oauth_token, + twitch=True, + stop_handler=lambda c: self.stopping.set(), + ) + self.client.channel('#{}'.format(channel)).join() + + def run(self): + # wait for twitch to send the initial ROOMSTATE for the room we've joined. + # everything preceeding this message is per-connection stuff we don't care about. + # once we get it, we register the handler to put everything following onto the + # message queue. + @self.client.handler(command='ROOMSTATE', sync=True) + def register_real_handler(client, message): + self.client.handler(lambda c, m: self.messages.put(m), sync=True) + return True + + self.client.start() + + last_server_time = None + last_timestamped_message = None + # {batch time: [messages]} + batches = {} + + while not self.stopping.is_set(): + # wait until we either have a message, are stopping, or a batch can be closed + if batches: + next_batch_close = min(batches.keys()) + self.BATCH_INTERVAL + self.MAX_SERVER_LAG + self.logger.debug("Next batch close at {} (batch times: {})".format(next_batch_close, batches.keys())) + timeout = max(0, next_batch_close - time.time()) + else: + timeout = None + self.logger.debug("Waiting up to {} for message or stop".format(timeout)) + gevent.wait([gevent.spawn(self.messages.peek), self.stopping], count=1, timeout=timeout) + + # close any closable batches + now = time.time() + for batch_time, messages in list(batches.items()): + if now >= batch_time + self.BATCH_INTERVAL + self.MAX_SERVER_LAG: + del batches[batch_time] + self.write_batch(batch_time, messages) + + # consume a message if any + try: + message = self.messages.get(block=False) + except gevent.queue.Empty: + continue + + if message.command not in self.COMMANDS: + self.logger.info("Skipping non-whitelisted command: {}".format(message.command)) + continue + + self.logger.debug("Got message: {}".format(message)) + data = { + attr: getattr(message, attr) + for attr in ('command', 'params', 'sender', 'user', 'host', 'tags') + } + data['receivers'] = {self.name: message.received_at} + self.logger.debug("Got message: {}".format(data)) + + if data['tags'] and 'tmi-sent-ts' in data['tags']: + # explicit server time is available + timestamp = int(data['tags']['tmi-sent-ts']) / 1000. # original is int ms + last_timestamped_message = message + last_server_time = timestamp + time_range = 0 + self.logger.debug("Message has exact timestamp: {}".format(timestamp)) + # check for any non-timestamped messages which we now know must have been + # before this message. We need to check this batch and the previous. + batch_time = int(timestamp / self.BATCH_INTERVAL) * self.BATCH_INTERVAL + for batch in (batch_time, batch_time - self.BATCH_INTERVAL): + for msg in batches.get(batch, []): + time_between = timestamp - msg['time'] + if 0 < time_between < msg['time_range']: + self.logger.debug("Updating previous message {m[command]}@{m[time]} range {m[time_range]} -> {new}".format( + m=msg, new=time_between, + )) + msg['time_range'] = time_between + elif last_server_time is not None: + # estimate current server time based on time since last timestamped message + est_server_time = last_server_time + time.time() - last_timestamped_message.received_at + # pad either side of the estimated server time, use this as a baseline + timestamp = est_server_time - self.ESTIMATED_TIME_PADDING + time_range = 2 * self.ESTIMATED_TIME_PADDING + # if previously timestamped message falls within this range, we know this message + # came after it + timestamp = max(timestamp, last_server_time) + else: + # we have no idea what the server time is, so we guess as 2x the normal padding + # starting from local time. + timestamp = time.time() - 2 * self.ESTIMATED_TIME_PADDING + time_range = 3 * self.ESTIMATED_TIME_PADDING + + if data['command'] in self.DELAYED_COMMANDS: + # might have happened 10s sooner than otherwise indicated. + timestamp -= 10 + time_range += 10 + + self.logger.debug("Message time determined as {} + up to {}".format(timestamp, time_range)) + data['time'] = timestamp + data['time_range'] = time_range + batch_time = int(timestamp / self.BATCH_INTERVAL) * self.BATCH_INTERVAL + batches.setdefault(batch_time, []).append(data) + + # Close any remaining batches + for batch_time, messages in batches.items(): + self.write_batch(batch_time, messages) + + self.client.wait_for_stop() # re-raise any errors + + def write_batch(self, batch_time, messages): + # We need to take some care to have a consistent ordering and format here. + # We use a "canonicalised JSON" format, which is really just whatever the python encoder does, + # with compact separators. + messages = [ + (message, json.dumps(message, separators=(',', ':'))) + for message in messages + ] + # We sort by timestamp, then timestamp range, then if all else fails, lexiographically + # on the encoded representation. + messages.sort(key=lambda item: (item[0]['time'], item[0]['time_range'], item[1])) + output = ("\n".join(line for message, line in messages) + "\n").encode("utf-8") + hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=") + time = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H:%M:%S") + filename = "{}-{}.json".format(time, hash) + filepath = os.path.join(self.path, filename) + temppath = "{}.{}.temp".format(filepath, uuid4()) + ensure_directory(filepath) + with open(temppath, 'wb') as f: + f.write(output) + os.rename(temppath, filepath) + self.logger.info("Wrote batch {}".format(filepath)) + + def stop(self): + self.client.stop() + + +def main(channel, nick, oauth_token_path, base_dir='/mnt'): + with open(oauth_token_path) as f: + oauth_token = f.read() + name = socket.gethostname() + + archiver = Archiver(name, base_dir, channel, nick, oauth_token) + + gevent.signal_handler(signal.SIGTERM, archiver.stop) + + logging.info("Starting") + archiver.run() + logging.info("Gracefully stopped") diff --git a/chat_archiver/girc b/chat_archiver/girc new file mode 160000 index 0000000..505ec0e --- /dev/null +++ b/chat_archiver/girc @@ -0,0 +1 @@ +Subproject commit 505ec0eb3c0a46adfad62383d9634fe971a430d5 diff --git a/chat_archiver/setup.py b/chat_archiver/setup.py new file mode 100644 index 0000000..80c6483 --- /dev/null +++ b/chat_archiver/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup, find_packages + +setup( + name='chat_archiver', + version='0.0.1', + author='Mike Lang', + author_email='mikelang3000@gmail.com', + description='', + packages=find_packages(), + install_requires=[ + 'argh', + 'gevent', + ], +) diff --git a/docker-compose.jsonnet b/docker-compose.jsonnet index d966d01..c1c4678 100644 --- a/docker-compose.jsonnet +++ b/docker-compose.jsonnet @@ -170,6 +170,14 @@ "youtube-manual", ], + chat_archiver:: { + image: "ghcr.io/ekimekim/wubloader-downloader:chat-archiver-hack-1", + channel: "desertbus", + user: "dbvideostriketeam", + logs_path: "%s/chat_logs" % $.segments_path, + token_path: "./chat_token.txt". + }, + // Extra options to pass via environment variables, // eg. log level, disabling stack sampling. env:: { @@ -441,6 +449,14 @@ [if $.db_standby then "command"]: ["/standby_setup.sh"], }, + [if $.chat_archiver != "null" then "chat_archiver"]: { + local c = $.chat_archiver, + image: c.image, + restart: "always", + command: [c.channel, c.user, "/token"], + volumes: ["%s:/mnt" % c.logs_path, "%s:/token" % c.token_path], + }, + }, }