mirror of https://github.com/ekimekim/wubloader
chat-archiver: Early work and basic archival
parent 6b9d8ab785
commit 0756539b85
.gitmodules
@@ -0,0 +1,3 @@
[submodule "chat_archiver/girc"]
	path = chat_archiver/girc
	url = https://github.com/ekimekim/girc
chat_archiver/Dockerfile
@@ -0,0 +1,6 @@
FROM quay.io/ekimekim/wubloader-downloader:32138bb
COPY girc /tmp/girc
RUN pip install /tmp/girc && rm -r /tmp/girc
COPY . /tmp/archiver
RUN pip install /tmp/archiver && rm -r /tmp/archiver
ENTRYPOINT ["python", "-m", "chat_archiver", "--base-dir", "/mnt"]
chat_archiver/README
@@ -0,0 +1,53 @@
Chat archiver records messages from TMI (the Twitch messaging system)
in a way that preserves as much context as possible, and allows multiple independently-recorded
streams to be combined to ensure nothing was missed.

We store messages in newline-delimited JSON files, in timestamp order.
Each file covers one minute of messages.
These files are named by their timestamp + hash and merged with other files via a CRDT model.
CRDT (Conflict-free Replicated Data Type) here means we have a merge operation (.) such that:
	(A.B).C == A.(B.C) (associative)
	A.B == B.A (commutative)
	A.A == A (idempotent)
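
A quick way to see why these laws matter - treating a file as a set of canonical
JSON lines, plain set union already satisfies all three (a toy sketch only, not
the real merge, which must also match messages without unique ids):

	def merge(a, b):
		"""Toy CRDT merge: files as sets of canonical JSON lines."""
		return a | b

	A, B, C = {'{"id":"1"}'}, {'{"id":"2"}'}, {'{"id":"3"}'}
	assert merge(merge(A, B), C) == merge(A, merge(B, C))  # associative
	assert merge(A, B) == merge(B, A)                      # commutative
	assert merge(A, A) == A                                # idempotent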

We have a few different kinds of messages to deal with:

Messages with a unique id and timestamp
	eg. PRIVMSG, USERNOTICE.
	These are easy, as they have a canonical timestamp and can be trivially deduplicated by id.
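
	For example, a minimal dedup over the unique id (a sketch; assumes the message
	dicts described below carry Twitch's "id" tag):

		def dedup_by_id(messages):
			seen = set()
			out = []
			for msg in messages:
				if msg['tags']['id'] not in seen:
					seen.add(msg['tags']['id'])
					out.append(msg)
			return out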

Messages with an implied ordering
	eg. ROOMSTATE, NOTICE, CLEARCHAT.
	These messages arrive in what we assume is a consistent order on all clients,
	but have no direct associated timestamp. We thus set a timestamp *range* for when
	the message could have occurred from the server's perspective, between the last known
	server timestamp (since it must be after that message) and the next received server timestamp
	(since it must be before that message). If no further timestamped message arrives within
	a short window, we fall back to a reasonable timeout instead.
	Messages with a timestamp range are ordered by their timestamp start.
	This also governs which file they are in if their range overlaps two files.

Messages known to be out of order
	This is specific to JOINs and PARTs.
	Twitch documents that these may be delayed by up to 10 seconds.
	So we follow the rules as per messages with implied ordering,
	except we pad the possible start time by 10 seconds (see the sketch below).
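
	A sketch of the range estimation (names are illustrative; the shipped logic is
	in main.py below, where ESTIMATED_TIME_PADDING plays the role of PADDING):

		DELAYED = {"JOIN", "PART"}  # documented as up to 10s late
		PADDING = 5

		def estimate_range(command, last_server_time, est_server_time):
			"""Return (start, length) of the window the message may fall in."""
			start = est_server_time - PADDING
			length = 2 * PADDING
			if last_server_time is not None:
				# must have come after the last timestamped message
				start = max(start, last_server_time)
			if command in DELAYED:
				# pad the start backwards for known-late commands
				start -= 10
				length += 10
			return start, length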

How to merge two files

In general, if the same message (all non-receiver fields identical) is present in both files,
it is included once in the output. For messages with unique ids, this is all that's needed.
For messages without unique ids, we face the question "is this the same message?".
All of the following must be true:
* All non-timestamp, non-receiver fields match
* Timestamp ranges overlap
If a message may match multiple messages on the other side under these rules, then
we pick one arbitrarily.
We then merge these messages, setting the timestamp range to the intersection of the inputs
(a sketch follows).
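
A hedged sketch of merging two matched copies (helper names are hypothetical,
not the shipped implementation):

	def ranges_overlap(a, b):
		return (a['time'] < b['time'] + b['time_range']
			and b['time'] < a['time'] + a['time_range'])

	def merge_pair(a, b):
		"""Intersect the timestamp ranges and union the receiver map."""
		start = max(a['time'], b['time'])
		end = min(a['time'] + a['time_range'], b['time'] + b['time_range'])
		merged = dict(a)
		merged['time'] = start
		merged['time_range'] = end - start
		merged['receivers'] = dict(a['receivers'], **b['receivers'])
		return merged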

Literal edge cases: Timestamp ranges that span two files

It may be the case that we can match a message whose timestamp range overhangs file A
with a message near the start of file B. So whenever we are merging files, we need to
also consider the file before and the file after on our side.
In all cases when we merge two messages, we should merge the receivers field, which maps
each receiver id to the timestamp at which it received the message. This preserves message provenance.

All files are stored in newline-delimited, canonicalised JSON so we can use hashes to compare them.
It should always be true that merging B into A and merging A into B produce identical files
with the same hash (effects of neighboring files notwithstanding - that just means more merges will
be needed in order to stabilize).
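
A sketch of why canonical encoding makes hash comparison safe (the shipped
write_batch below relies on identical construction order; sort_keys is added
here only for extra robustness):

	import hashlib, json

	def canonical(msg):
		# compact separators + sorted keys give a byte-identical encoding
		return json.dumps(msg, separators=(',', ':'), sort_keys=True)

	def file_hash(messages):
		body = "\n".join(sorted(canonical(m) for m in messages)) + "\n"
		return hashlib.sha256(body.encode('utf-8')).hexdigest()

	a = [{'command': 'PRIVMSG', 'time': 1.0}]
	b = [{'time': 1.0, 'command': 'PRIVMSG'}]
	assert file_hash(a) == file_hash(b)  # same content => same hash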

chat_archiver/chat_archiver/__main__.py
@@ -0,0 +1,15 @@
import gevent.monkey
gevent.monkey.patch_all()

import logging
import os

import argh

from .main import main

LOG_FORMAT = "[%(asctime)s] %(levelname)8s %(name)s(%(module)s:%(lineno)d): %(message)s"

level = os.environ.get('WUBLOADER_LOG_LEVEL', 'INFO').upper()
logging.basicConfig(level=level, format=LOG_FORMAT)
argh.dispatch_command(main)

chat_archiver/chat_archiver/main.py
@@ -0,0 +1,209 @@

import base64
import hashlib
import json
import logging
import os
import signal
import socket
import time
from datetime import datetime
from uuid import uuid4

import gevent.event
import gevent.queue

from common import ensure_directory

from girc import Client


class Archiver(object):
	# These are known to arrive up to 10s after their actual time
	DELAYED_COMMANDS = [
		"JOIN",
		"PART",
	]

	COMMANDS = DELAYED_COMMANDS + [
		"PRIVMSG",
		"CLEARCHAT",
		"CLEARMSG",
		"HOSTTARGET",
		"NOTICE",
		"ROOMSTATE",
		"USERNOTICE",
		"USERSTATE",
	]

	# How long each batch is, in seconds
	BATCH_INTERVAL = 60

	# Assume we're never more than this amount of time behind the server time.
	# Worst case if too low: multiple output files for the same batch that need merging later.
	MAX_SERVER_LAG = 30

	# When guessing when a non-timestamped event occurred, pad the possible range
	# by up to this amount before and after our best guess.
	ESTIMATED_TIME_PADDING = 5

	def __init__(self, name, base_dir, channel, nick, oauth_token):
		self.logger = logging.getLogger(type(self).__name__).getChild(channel)
		self.name = name
		self.messages = gevent.queue.Queue()
		self.path = os.path.join(base_dir, channel)

		self.stopping = gevent.event.Event()
		self.client = Client(
			hostname='irc.chat.twitch.tv',
			port=6697,
			ssl=True,
			nick=nick,
			password=oauth_token,
			twitch=True,
			stop_handler=lambda c: self.stopping.set(),
		)
		self.client.channel('#{}'.format(channel)).join()

	def run(self):
		# wait for twitch to send the initial ROOMSTATE for the room we've joined.
		# everything preceding this message is per-connection stuff we don't care about.
		# once we get it, we register the handler to put everything following onto the
		# message queue.
		@self.client.handler(command='ROOMSTATE', sync=True)
		def register_real_handler(client, message):
			self.client.handler(lambda c, m: self.messages.put(m), sync=True)
			return True

		self.client.start()

		last_server_time = None
		last_timestamped_message = None
		# {batch time: [messages]}
		batches = {}

		while not self.stopping.is_set():
			# wait until we either have a message, are stopping, or a batch can be closed
			if batches:
				next_batch_close = min(batches.keys()) + self.BATCH_INTERVAL + self.MAX_SERVER_LAG
				self.logger.debug("Next batch close at {} (batch times: {})".format(next_batch_close, batches.keys()))
				timeout = max(0, next_batch_close - time.time())
			else:
				timeout = None
			self.logger.debug("Waiting up to {} for message or stop".format(timeout))
			gevent.wait([gevent.spawn(self.messages.peek), self.stopping], count=1, timeout=timeout)

			# close any closable batches
			now = time.time()
			for batch_time, messages in list(batches.items()):
				if now >= batch_time + self.BATCH_INTERVAL + self.MAX_SERVER_LAG:
					del batches[batch_time]
					self.write_batch(batch_time, messages)

			# consume a message if any
			try:
				message = self.messages.get(block=False)
			except gevent.queue.Empty:
				continue

			if message.command not in self.COMMANDS:
				self.logger.info("Skipping non-whitelisted command: {}".format(message.command))
				continue

self.logger.debug("Got message: {}".format(message))
|
||||||
|
data = {
|
||||||
|
attr: getattr(message, attr)
|
||||||
|
for attr in ('command', 'params', 'sender', 'user', 'host', 'tags')
|
||||||
|
}
|
||||||
|
data['receivers'] = {self.name: message.received_at}
|
||||||
|
self.logger.debug("Got message: {}".format(data))
|
||||||
|
|
||||||
|
if data['tags'] and 'tmi-sent-ts' in data['tags']:
|
||||||
|
# explicit server time is available
|
||||||
|
timestamp = int(data['tags']['tmi-sent-ts']) / 1000. # original is int ms
|
||||||
|
last_timestamped_message = message
|
||||||
|
last_server_time = timestamp
|
||||||
|
time_range = 0
|
||||||
|
self.logger.debug("Message has exact timestamp: {}".format(timestamp))
|
||||||
|
# check for any non-timestamped messages which we now know must have been
|
||||||
|
# before this message. We need to check this batch and the previous.
|
||||||
|
batch_time = int(timestamp / self.BATCH_INTERVAL) * self.BATCH_INTERVAL
|
||||||
|
for batch in (batch_time, batch_time - self.BATCH_INTERVAL):
|
||||||
|
for msg in batches.get(batch, []):
|
||||||
|
time_between = timestamp - msg['time']
|
||||||
|
if 0 < time_between < msg['time_range']:
|
||||||
|
self.logger.debug("Updating previous message {m[command]}@{m[time]} range {m[time_range]} -> {new}".format(
|
||||||
|
m=msg, new=time_between,
|
||||||
|
))
|
||||||
|
msg['time_range'] = time_between
|
||||||
|
			elif last_server_time is not None:
				# estimate current server time based on time since last timestamped message
				est_server_time = last_server_time + time.time() - last_timestamped_message.received_at
				# pad either side of the estimated server time, use this as a baseline
				timestamp = est_server_time - self.ESTIMATED_TIME_PADDING
				time_range = 2 * self.ESTIMATED_TIME_PADDING
				# if the previously timestamped message falls within this range, we know this
				# message came after it
				timestamp = max(timestamp, last_server_time)
			else:
				# we have no idea what the server time is, so we guess, starting from local time
				# and using 2x the normal padding.
				timestamp = time.time() - 2 * self.ESTIMATED_TIME_PADDING
				time_range = 3 * self.ESTIMATED_TIME_PADDING

			if data['command'] in self.DELAYED_COMMANDS:
				# might have happened 10s sooner than otherwise indicated.
				timestamp -= 10
				time_range += 10

			self.logger.debug("Message time determined as {} + up to {}".format(timestamp, time_range))
			data['time'] = timestamp
			data['time_range'] = time_range
			batch_time = int(timestamp / self.BATCH_INTERVAL) * self.BATCH_INTERVAL
			batches.setdefault(batch_time, []).append(data)

		# Close any remaining batches
		for batch_time, messages in batches.items():
			self.write_batch(batch_time, messages)

		self.client.wait_for_stop()  # re-raise any errors

	def write_batch(self, batch_time, messages):
		# We need to take some care to have a consistent ordering and format here.
		# We use a "canonicalised JSON" format, which is really just whatever the python encoder does,
		# with compact separators.
		messages = [
			(message, json.dumps(message, separators=(',', ':')))
			for message in messages
		]
		# We sort by timestamp, then timestamp range, then if all else fails, lexicographically
		# on the encoded representation.
		messages.sort(key=lambda item: (item[0]['time'], item[0]['time_range'], item[1]))
		output = ("\n".join(line for message, line in messages) + "\n").encode("utf-8")
		hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=")
		time = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H:%M:%S")
		filename = "{}-{}.json".format(time, hash)
		filepath = os.path.join(self.path, filename)
		temppath = "{}.{}.temp".format(filepath, uuid4())
		ensure_directory(filepath)
		# write to a temp file then rename, so readers never see a partial file
		with open(temppath, 'wb') as f:
			f.write(output)
		os.rename(temppath, filepath)
		self.logger.info("Wrote batch {}".format(filepath))

	def stop(self):
		self.client.stop()


def main(channel, nick, oauth_token_path, base_dir='/mnt'):
	with open(oauth_token_path) as f:
		oauth_token = f.read().strip()  # strip trailing newline so the IRC password is exact
	name = socket.gethostname()

	archiver = Archiver(name, base_dir, channel, nick, oauth_token)

	gevent.signal_handler(signal.SIGTERM, archiver.stop)

	logging.info("Starting")
	archiver.run()
	logging.info("Gracefully stopped")
chat_archiver/girc (submodule)
@@ -0,0 +1 @@
Subproject commit 505ec0eb3c0a46adfad62383d9634fe971a430d5
chat_archiver/setup.py
@@ -0,0 +1,14 @@
from setuptools import setup, find_packages

setup(
	name='chat_archiver',
	version='0.0.1',
	author='Mike Lang',
	author_email='mikelang3000@gmail.com',
	description='',
	packages=find_packages(),
	install_requires=[
		'argh',
		'gevent',
	],
)