reintroduced a start time for the backfiller; more logging

pull/43/head
Chris Usher 6 years ago committed by Christopher Usher
parent 292188ad7c
commit ed58b6e44d

@ -9,6 +9,7 @@ import random
import time import time
import uuid import uuid
import dateutil.parser
import gevent.backdoor import gevent.backdoor
import prometheus_client as prom import prometheus_client as prom
import requests import requests
@ -97,10 +98,10 @@ def get_remote_segment(base_dir, node, stream, variant, hour, missing_segment,
locally, this does not attempt to fetch it.""" locally, this does not attempt to fetch it."""
path = os.path.join(base_dir, stream, variant, hour, missing_segment) path = os.path.join(base_dir, stream, variant, hour, missing_segment)
logging.debug('Getting segment {}'.format(path))
# check to see if file was created since we listed the local segments to # check to see if file was created since we listed the local segments to
# avoid unnecessarily copying # avoid unnecessarily copying
if os.path.exists(path): if os.path.exists(path):
logging.debug('Skipping exisiting segment {}'.format(path))
return return
dir_name = os.path.dirname(path) dir_name = os.path.dirname(path)
@ -110,6 +111,7 @@ def get_remote_segment(base_dir, node, stream, variant, hour, missing_segment,
common.ensure_directory(temp_path) common.ensure_directory(temp_path)
try: try:
logging.debug('Fetching segment {} from {}'.format(path, node))
uri = '{}/segments/{}/{}/{}/{}'.format(node, stream, variant, hour, missing_segment) uri = '{}/segments/{}/{}/{}/{}'.format(node, stream, variant, hour, missing_segment)
resp = requests.get(uri, stream=True, timeout=timeout) resp = requests.get(uri, stream=True, timeout=timeout)
@ -129,7 +131,7 @@ def get_remote_segment(base_dir, node, stream, variant, hour, missing_segment,
segments_backfilled.labels(remote=node, stream=stream, variant=variant, hour=hour).inc() segments_backfilled.labels(remote=node, stream=stream, variant=variant, hour=hour).inc()
def backfill(base_dir, stream, variants, hours=None, nodes=None): def backfill(base_dir, stream, variants, hours=None, nodes=None, start=None):
"""Loop over nodes backfilling from each. """Loop over nodes backfilling from each.
Backfill from node/stream/variants to base_dir/stream/variants for each node Backfill from node/stream/variants to base_dir/stream/variants for each node
@ -157,7 +159,7 @@ def is_iterable(x):
return True return True
def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='random', recent_cutoff=60): def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='random', recent_cutoff=60, start=None):
"""Backfill from remote node. """Backfill from remote node.
Backfill from node/stream/variants to base_dir/stream/variants. Backfill from node/stream/variants to base_dir/stream/variants.
@ -170,9 +172,10 @@ def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='r
If 'forward', sort the segment in ascending order. If 'reverse', sort If 'forward', sort the segment in ascending order. If 'reverse', sort
the segments in descending order. Otherwise, do not change the order of the segments in descending order. Otherwise, do not change the order of
segments. segments.
use start and stop to limit which segments are backfilled.
recent_cutoff -- Skip backfilling segments younger than this number of recent_cutoff -- Skip backfilling segments younger than this number of
seconds to prioritise letting the downloader grab these segments.""" seconds to prioritise letting the downloader grab these segments.
start -- Do not backfill hours starting before this time. If None (default),
all hours are backfilled"""
logging.info('Starting backfilling from {}'.format(node)) logging.info('Starting backfilling from {}'.format(node))
@ -194,6 +197,11 @@ def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='r
for variant in variants: for variant in variants:
for hour in hours: for hour in hours:
hour_time = datetime.datetime.strptime(hour, HOUR_FMT)
if start is not None and hour_time < start:
logging.debug('Skipping {}/{}/{} as before start'.format(stream, variant, hour))
continue
logging.info('Backfilling {}/{}/{}'.format(stream, variant, hour)) logging.info('Backfilling {}/{}/{}'.format(stream, variant, hour))
local_segments = set(list_local_segments(base_dir, stream, variant, hour)) local_segments = set(list_local_segments(base_dir, stream, variant, hour))
@ -221,6 +229,7 @@ def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='r
#to avoid getting in the downloader's way ignore segments less than recent_cutoff old #to avoid getting in the downloader's way ignore segments less than recent_cutoff old
if datetime.datetime.utcnow() - segment.start < datetime.timedelta(seconds=recent_cutoff): if datetime.datetime.utcnow() - segment.start < datetime.timedelta(seconds=recent_cutoff):
logging.debug('Skipping {} as too recent'.format(path))
continue continue
get_remote_segment(base_dir, node, stream, variant, hour, missing_segment) get_remote_segment(base_dir, node, stream, variant, hour, missing_segment)
@ -229,7 +238,7 @@ def backfill_node(base_dir, node, stream, variants, hours=None, segment_order='r
logging.info('Finished backfilling from {}'.format(node)) logging.info('Finished backfilling from {}'.format(node))
def main(base_dir='.', stream='', variants='', fill_wait=5, full_fill_wait=180, sleep_time=1, metrics_port=8002, nodes=None, backdoor_port=0): def main(base_dir='.', stream='', variants='', fill_wait=5, full_fill_wait=180, sleep_time=1, metrics_port=8002, nodes=None, backdoor_port=0, start=None):
"""Prototype backfiller service. """Prototype backfiller service.
Do a backfill of the last 3 hours from stream/variants from all nodes Do a backfill of the last 3 hours from stream/variants from all nodes
@ -245,6 +254,8 @@ def main(base_dir='.', stream='', variants='', fill_wait=5, full_fill_wait=180,
variants = variants.split(',') if variants else [] variants = variants.split(',') if variants else []
if nodes is not None: if nodes is not None:
nodes = nodes.split(',') if nodes else [] nodes = nodes.split(',') if nodes else []
if start is not None:
start = dateutil.parser.parse(start)
common.PromLogCountsHandler.install() common.PromLogCountsHandler.install()
common.install_stacksampler() common.install_stacksampler()
@ -259,9 +270,9 @@ def main(base_dir='.', stream='', variants='', fill_wait=5, full_fill_wait=180,
full_fill_start = fill_start full_fill_start = fill_start
backfill(base_dir, stream, variants, 3, nodes=nodes) backfill(base_dir, stream, variants, 3, nodes=nodes, start=start)
backfill(base_dir, stream, variants, nodes=nodes) backfill(base_dir, stream, variants, nodes=nodes, start=start)
# I'm sure there is a module that does this in a more robust way # I'm sure there is a module that does this in a more robust way
# but I understand this and it gives the behaviour I want # but I understand this and it gives the behaviour I want
@ -271,14 +282,14 @@ def main(base_dir='.', stream='', variants='', fill_wait=5, full_fill_wait=180,
if now - full_fill_start > datetime.timedelta(minutes=full_fill_wait): if now - full_fill_start > datetime.timedelta(minutes=full_fill_wait):
backfill(base_dir, stream, variants, nodes=nodes) backfill(base_dir, stream, variants, nodes=nodes, start=start)
fill_start = now fill_start = now
full_fill_start = fill_start full_fill_start = fill_start
elif now - fill_start > datetime.timedelta(minutes=fill_wait): elif now - fill_start > datetime.timedelta(minutes=fill_wait):
backfill(base_dir, stream, variants, 3, nodes=nodes) backfill(base_dir, stream, variants, 3, nodes=nodes, start=start)
fill_start = now fill_start = now

@ -7,6 +7,7 @@ setup(
install_requires = [ install_requires = [
"argh", "argh",
"gevent", "gevent",
"python-dateutil",
"requests", "requests",
"wubloader-common", "wubloader-common",
], ],

Loading…
Cancel
Save