chat_archiver: Split files into directories by hour

matching how we handle video files
pull/300/head
Mike Lang 2 years ago committed by Mike Lang
parent 1d626738bd
commit 05ddd39504

@ -1,5 +1,6 @@
import base64 import base64
import errno
import hashlib import hashlib
import json import json
import logging import logging
@ -53,7 +54,7 @@ class Archiver(object):
self.logger = logging.getLogger(type(self).__name__).getChild(channel) self.logger = logging.getLogger(type(self).__name__).getChild(channel)
self.name = name self.name = name
self.messages = gevent.queue.Queue() self.messages = gevent.queue.Queue()
self.path = os.path.join(base_dir, channel) self.path = os.path.join(base_dir, channel, "chat")
self.stopping = gevent.event.Event() self.stopping = gevent.event.Event()
self.got_reconnect = gevent.event.Event() self.got_reconnect = gevent.event.Event()
@ -188,8 +189,9 @@ class Archiver(object):
def write_batch(path, batch_time, messages): def write_batch(path, batch_time, messages):
output = (format_batch(messages) + '\n').encode('utf-8') output = (format_batch(messages) + '\n').encode('utf-8')
hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=") hash = base64.b64encode(hashlib.sha256(output).digest(), b"-_").decode().rstrip("=")
time = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H:%M:%S") hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")
filename = "{}-{}.json".format(time, hash) time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")
filename = os.path.join(hour, "{}-{}.json".format(time, hash))
filepath = os.path.join(path, filename) filepath = os.path.join(path, filename)
if os.path.exists(filepath): if os.path.exists(filepath):
logging.info("Not writing batch {} - already exists.".format(filename)) logging.info("Not writing batch {} - already exists.".format(filename))
@ -219,10 +221,19 @@ def format_batch(messages):
def get_batch_files(path, batch_time): def get_batch_files(path, batch_time):
"""Returns list of batch filepaths for a given batch time""" """Returns list of batch filepaths for a given batch time"""
time = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H:%M:%S-") hour = datetime.utcfromtimestamp(batch_time).strftime("%Y-%m-%dT%H")
time = datetime.utcfromtimestamp(batch_time).strftime("%M:%S")
hourdir = os.path.join(path, hour)
# return [] if dir doesn't exist
try:
files = os.listdir(hourdir)
except OSError as e:
if e.errno != errno.ENOENT:
raise
return []
return [ return [
os.path.join(path, name) os.path.join(hourdir, name)
for name in os.listdir(path) for name in files
if name.startswith(time) and name.endswith(".json") if name.startswith(time) and name.endswith(".json")
] ]
@ -232,10 +243,12 @@ def merge_all(path):
Returns once it finds no duplicate files.""" Returns once it finds no duplicate files."""
while True: while True:
by_time = {} by_time = {}
for name in os.listdir(path): for hour in os.listdir(path):
if not name.endswith(".json"): for name in os.listdir(os.path.join(path, hour)):
continue if not name.endswith(".json"):
timestamp = "-".join(name.split("-")[:3]) continue
time = "-".join(name.split("-")[:3])
timestamp = "{}:{}".format(hour, time)
by_time[timestamp] = by_time.get(timestamp, 0) + 1 by_time[timestamp] = by_time.get(timestamp, 0) + 1
if not any(count > 1 for timestamp, count in by_time.items()): if not any(count > 1 for timestamp, count in by_time.items()):
break break

Loading…
Cancel
Save