chat-archiver: fixes

pull/300/head
Mike Lang 2 years ago committed by Mike Lang
parent c1c1c11bce
commit 05a989f67d

@ -19,4 +19,4 @@ COPY chat_archiver /tmp/archiver
RUN pip install /tmp/archiver && rm -r /tmp/archiver RUN pip install /tmp/archiver && rm -r /tmp/archiver
LABEL org.opencontainers.image.source https://github.com/ekimekim/wubloader LABEL org.opencontainers.image.source https://github.com/ekimekim/wubloader
ENTRYPOINT ["python", "-m", "chat_archiver", "--base-dir", "/mnt"] ENTRYPOINT ["python3", "-m", "chat_archiver", "--base-dir", "/mnt"]

@ -5,6 +5,8 @@ import hashlib
import json import json
import logging import logging
import os import os
import random
import string
import signal import signal
import socket import socket
import time import time
@ -53,7 +55,7 @@ MAX_SERVER_LAG = 30
ESTIMATED_TIME_PADDING = 5 ESTIMATED_TIME_PADDING = 5
messages_received = prom.Counter( messages_received = prom.Counter(
"messages_received" "messages_received",
"Number of chat messages recieved by the client. 'client' tag is per client instance.", "Number of chat messages recieved by the client. 'client' tag is per client instance.",
["channel", "client", "command"], ["channel", "client", "command"],
) )
@ -114,6 +116,7 @@ class Archiver(object):
self.logger = logging.getLogger(type(self).__name__).getChild(channel) self.logger = logging.getLogger(type(self).__name__).getChild(channel)
self.name = name self.name = name
self.messages = gevent.queue.Queue() self.messages = gevent.queue.Queue()
self.channel = channel
self.path = os.path.join(base_dir, channel, "chat") self.path = os.path.join(base_dir, channel, "chat")
self.stopping = gevent.event.Event() self.stopping = gevent.event.Event()
@ -265,6 +268,16 @@ class Archiver(object):
self.client.stop() self.client.stop()
def listdir(path):
	"""Like os.listdir(), but treat a missing directory as empty.

	Returns the list of entries in path, or [] when the directory does
	not exist. Any OSError other than ENOENT is propagated unchanged.
	"""
	try:
		entries = os.listdir(path)
	except OSError as error:
		# A nonexistent directory simply means "no entries"; anything
		# else (permissions, I/O error, ...) is a real failure.
		if error.errno == errno.ENOENT:
			return []
		raise
	return entries
def write_batch(path, batch_time, messages, size_histogram=None): def write_batch(path, batch_time, messages, size_histogram=None):
"""Batches are named PATH/YYYY-MM-DDTHH/MM:SS-HASH.json""" """Batches are named PATH/YYYY-MM-DDTHH/MM:SS-HASH.json"""
output = (format_batch(messages) + '\n').encode('utf-8') output = (format_batch(messages) + '\n').encode('utf-8')
@ -276,7 +289,7 @@ def write_batch(path, batch_time, messages, size_histogram=None):
filename = os.path.join(hour, "{}-{}.json".format(time, hash)) filename = os.path.join(hour, "{}-{}.json".format(time, hash))
filepath = os.path.join(path, filename) filepath = os.path.join(path, filename)
if os.path.exists(filepath): if os.path.exists(filepath):
logging.info("Not writing batch {} - already exists.".format(filename)) logging.debug("Not writing batch {} - already exists.".format(filename))
else: else:
temppath = "{}.{}.temp".format(filepath, uuid4()) temppath = "{}.{}.temp".format(filepath, uuid4())
ensure_directory(filepath) ensure_directory(filepath)
@ -306,16 +319,9 @@ def get_batch_files(path, batch_time):
hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H") hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")
time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S") time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")
hourdir = os.path.join(path, hour) hourdir = os.path.join(path, hour)
# return [] if dir doesn't exist
try:
files = os.listdir(hourdir)
except OSError as e:
if e.errno != errno.ENOENT:
raise
return []
return [ return [
os.path.join(hourdir, name) os.path.join(hourdir, name)
for name in files for name in listdir(hourdir)
if name.startswith(time) and name.endswith(".json") if name.startswith(time) and name.endswith(".json")
] ]
@ -336,12 +342,12 @@ def merge_all(path, interval=None, stopping=None):
while True: while True:
logging.debug("Scanning for merges") logging.debug("Scanning for merges")
by_time = {} by_time = {}
for hour in os.listdir(path): for hour in listdir(path):
for name in os.listdir(os.path.join(path, hour)): for name in listdir(os.path.join(path, hour)):
if not name.endswith(".json"): if not name.endswith(".json"):
continue continue
time = "-".join(name.split("-")[:3]) min_sec = name.split("-")[0]
timestamp = "{}:{}".format(hour, time) timestamp = "{}:{}".format(hour, min_sec)
by_time[timestamp] = by_time.get(timestamp, 0) + 1 by_time[timestamp] = by_time.get(timestamp, 0) + 1
if not any(count > 1 for timestamp, count in by_time.items()): if not any(count > 1 for timestamp, count in by_time.items()):
logging.info("All batches are merged") logging.info("All batches are merged")
@ -508,10 +514,11 @@ def main(channel, nick, oauth_token_path, base_dir='/mnt', name=None, merge_inte
with open(oauth_token_path) as f: with open(oauth_token_path) as f:
oauth_token = f.read() oauth_token = f.read()
# To ensure uniqueness even if multiple instances are running on the same host, # To ensure uniqueness even if multiple instances are running on the same host,
# also include our pid # also include a random slug
if name is None: if name is None:
name = socket.gethostname() name = socket.gethostname()
name = "{}.{}".format(name, os.getpid()) slug = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5))
name = "{}.{}".format(name, slug)
stopping = gevent.event.Event() stopping = gevent.event.Event()
gevent.signal_handler(signal.SIGTERM, stopping.set) gevent.signal_handler(signal.SIGTERM, stopping.set)

@ -1 +1 @@
Subproject commit 4b3b7172b9159ab7d6ee1d9b1ed6ebb9184f7911 Subproject commit 0a5934e675810b4b6dea97fe94a5be267b4025f9
Loading…
Cancel
Save