# mirror of https://github.com/ekimekim/wubloader
import json
|
|
import time
|
|
import hashlib
|
|
|
|
# Calculates intersection of time range of both messages, or None if they don't overlap
def overlap(a, b):
    """Return (start, length) of the time-range intersection of messages a and b.

    Each message carries a "time" (range start) and a "time_range" (range length).
    Returns None when the two ranges do not intersect at all; a zero-length
    touch at the boundary counts as an overlap of length 0.
    """
    latest_start = max(a['time'], b['time'])
    earliest_end = min(a['time'] + a['time_range'], b['time'] + b['time_range'])
    if latest_start > earliest_end:
        return None
    return latest_start, earliest_end - latest_start
|
|
|
|
# Returns merged message if two messages are compatible with being the same message,
# or else None.
def merge_message(a, b):
    """Merge messages a and b into one if they could be the same message, else None.

    Two messages are mergeable when their time ranges overlap and every field
    other than "receivers", "time" and "time_range" is identical. The merged
    message narrows the time range to the overlap and combines both receiver
    timestamp maps.

    Raises ValueError if the same receiver appears in both messages with
    different timestamps (merging would silently overwrite one of them).
    """
    o = overlap(a, b)
    if o and all(
        a.get(k) == b.get(k)
        for k in set(a.keys()) | set(b.keys())
        if k not in ("receivers", "time", "time_range")
    ):
        receivers = a["receivers"] | b["receivers"]
        # Error checking - make sure no receiver timestamps are being overwritten.
        # This would indicate we're merging two messages received at different times
        # by the same recipient.
        # BUG FIX: the check must look inside each message's "receivers" map;
        # previously it tested the message's top-level keys, so conflicting
        # receiver timestamps were never detected.
        for k in receivers.keys():
            for old in (a, b):
                if k in old["receivers"] and old["receivers"][k] != receivers[k]:
                    raise ValueError(f"Merge would merge two messages with different recipient timestamps: {a}, {b}")
        return a | {
            "time": o[0],
            "time_range": o[1],
            "receivers": receivers,
        }
    return None
|
|
|
|
def merge_messages(*batches):
    """Merge multiple batches of messages into one combined, deduplicated list.

    Each batch must be a list of message dicts sorted ascending by "time".
    Messages from different batches that plausibly represent the same
    underlying message (overlapping time ranges and matching content, or a
    matching "id" tag) are merged via merge_message(). Note the input lists
    are consumed (mutated) in the process.

    Raises ValueError if two messages share an id tag but cannot be merged,
    or if merging would overwrite a receiver timestamp.
    """
    batches = [b for b in batches if len(b) > 0]

    result = []
    while len(batches) > 1:
        # Shortcut for non-overlapping time ranges - if one batch has messages that end before
        # the start of any other batch, take those messages directly without trying to match them.

        # Find batches with the earliest and second earliest messages
        first = batches[0]
        second = batches[1]
        if first[0]["time"] > second[0]["time"]:
            first, second = second, first
        for batch in batches[2:]:
            if batch[0]["time"] < first[0]["time"]:
                first, second = batch, first
            elif batch[0]["time"] < second[0]["time"]:
                second = batch

        # Find messages in first batch that end before the start of second batch,
        # and copy them directly to result.
        cutoff = second[0]["time"]
        while first and first[0]["time"] + first[0]["time_range"] < cutoff:
            result.append(first[0])
            first.pop(0)

        # If first now overlaps with second, move on to try to find messages to merge.
        # If no overlap (either first is exhausted, or second now starts sooner than first)
        # then just start again from the top and look for more non-overlapping ranges.
        if not first:
            batches.remove(first)
            continue
        if cutoff < first[0]["time"]:
            continue

        message = first.pop(0)
        # Messages carrying an "id" tag can be matched exactly; others fall back
        # to heuristic matching via merge_message(). (Named message_id, not id,
        # to avoid shadowing the builtin.)
        message_id = (message.get("tags") or {}).get("id")

        # For each other batch, attempt to find a matching message
        for batch in batches:
            if batch is first:
                continue
            end = message["time"] + message["time_range"]
            merged = None
            for index, candidate in enumerate(batch):
                # optimization: stop when earliest candidate is after message's end time
                if end < candidate['time']:
                    break
                if message_id is None:
                    merged = merge_message(message, candidate)
                    if merged is not None:
                        batch.pop(index)
                        break
                elif (candidate.get("tags") or {}).get("id") == message_id:
                    merged = merge_message(message, candidate)
                    if merged is None:
                        raise ValueError(
                            f"Messages with matching id {message_id!r} could not be merged: {message}, {candidate}"
                        )
                    batch.pop(index)
                    break
            if merged is not None:
                message = merged

        result.append(message)
        batches = [b for b in batches if len(b) > 0]

    # Once all but one batch is exhausted, the last one must have all remaining messages unmatched.
    # So just append everything.
    if batches:
        result += batches[0]

    return result
|
|
|
|
|
|
def load(file):
    """Load a batch of messages from a file containing one JSON object per line.

    Blank lines (including a trailing newline or a fully empty file) are
    skipped, instead of being passed to json.loads() and raising.
    """
    with open(file) as f:
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
|
def main(*files):
    """Merge the message batches stored in the given files.

    Usage: main [--out] FILE [FILE ...]

    With --out, prints the merged result as JSON. Otherwise prints how long
    the merge took along with a SHA-256 of the result, so separate runs can
    be compared cheaply.
    """
    out = False
    if files and files[0] == "--out":
        files = files[1:]
        out = True
    batches = [load(file) for file in files]
    start = time.monotonic()
    result = merge_messages(*batches)
    interval = time.monotonic() - start
    if out:
        print(json.dumps(result))
    else:
        # Named digest (not hash) to avoid shadowing the builtin hash().
        digest = hashlib.sha256(json.dumps(result).encode()).hexdigest()
        print(f"Merged {len(batches)} batches in {interval:.3f}s to hash {digest}")
|
|
|
|
# Script entry point: arguments are the files to merge, optionally preceded by --out.
if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])
|