@@ -3,6 +3,7 @@
 import datetime
 import errno
 import hashlib
+import itertools
 import logging
 import os
 import random
@@ -50,7 +51,7 @@ node_list_errors = prom.Counter(
 backfill_errors = prom.Counter(
 	'backfill_errors',
 	'Number of errors backfilling',
-	['remote', 'channel'],
+	['remote'],
 )
 
 segments_deleted = prom.Counter(
@@ -169,10 +170,10 @@ def get_remote_segment(base_dir, node, channel, quality, hour, missing_segment,
 	logger.info('Segment {}/{}/{} backfilled'.format(quality, hour, missing_segment))
 
 
-def list_hours(node, channel, qualities, start=None):
+def list_hours(node, channel, quality, start=None):
 	"""Return a list of all available hours from a node.
 
-	List all hours available from node/channel for each quality in qualities
+	List all hours available from node/channel
 	ordered from newest to oldest.
 
 	Keyword arguments:
@@ -180,8 +181,7 @@ def list_hours(node, channel, qualities, start=None):
 	return hours more recent than that number of hours ago. If None (default),
 	all hours are returned."""
 
-	hour_lists = [list_remote_hours(node, channel, quality) for quality in qualities]
-	hours = list(set().union(*hour_lists))
+	hours = list_remote_hours(node, channel, quality)
 	hours.sort(reverse=True) #latest hour first
 
 	if start is not None:
@@ -204,7 +204,7 @@ class BackfillerManager(object):
 	NODE_INTERVAL = 300 #seconds between updating list of nodes
 
-	def __init__(self, base_dir, channel, qualities, static_nodes=[],
+	def __init__(self, base_dir, channels, qualities, static_nodes=[],
 			start=None, delete_old=False, run_once=False, node_file=None,
 			node_database=None, localhost=None, download_concurrency=5,
 			recent_cutoff=120):
@@ -212,7 +212,7 @@ class BackfillerManager(object):
 		Creates a manager for a given channel with specified qualities."""
 		self.base_dir = base_dir
-		self.channel = channel
+		self.channels = channels
 		self.qualities = qualities
 		self.static_nodes = static_nodes
 		self.start = start
@@ -225,7 +225,7 @@ class BackfillerManager(object):
 		self.download_concurrency = download_concurrency
 		self.recent_cutoff = recent_cutoff
 		self.stopping = gevent.event.Event()
-		self.logger = logging.getLogger("BackfillerManager({})".format(channel))
+		self.logger = logging.getLogger("BackfillerManager")
 		self.workers = {} # {node url: worker}
 
 	def stop(self):
@@ -257,8 +257,8 @@ class BackfillerManager(object):
 		else:
 			self.logger.info('Deleting hours older than {} hours ago'.format(self.start))
 
-		for quality in self.qualities:
-			hours = list_local_hours(self.base_dir, self.channel, quality)
+		for channel, quality in itertools.product(self.channels, self.qualities):
+			hours = list_local_hours(self.base_dir, channel, quality)
 			if not isinstance(self.start, datetime.datetime):
 				cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=self.start)
 			else:
@@ -270,13 +270,13 @@ class BackfillerManager(object):
 				# deleting segments can take a bit time but is less important
 				# than the actually backfilling so we yield
 				gevent.idle()
-				path = os.path.join(self.base_dir, self.channel, quality, hour)
+				path = os.path.join(self.base_dir, channel, quality, hour)
 				self.logger.info('Deleting {}'.format(path))
-				segments = list_local_segments(self.base_dir, self.channel, quality, hour)
+				segments = list_local_segments(self.base_dir, channel, quality, hour)
 				for segment in segments:
 					try:
 						os.remove(os.path.join(path, segment))
-						segments_deleted.labels(channel=self.channel, quality=quality, hour=hour).inc()
+						segments_deleted.labels(channel=channel, quality=quality, hour=hour).inc()
 					except OSError as e:
 						# ignore error when the file is already gone
 						if e.errno != errno.ENOENT:
@@ -411,7 +411,7 @@ class BackfillerWorker(object):
 		self.base_dir = manager.base_dir
 		self.node = node
 		self.download_concurrency = manager.download_concurrency
-		self.channel = manager.channel
+		self.channels = manager.channels
 		self.qualities = manager.qualities
 		self.start = manager.start
 		self.run_once = manager.run_once
@@ -420,7 +420,7 @@ class BackfillerWorker(object):
 		self.done = gevent.event.Event()
 
 	def __repr__(self):
-		return '<{} at 0x{:x} for {!r}/{!r}>'.format(type(self).__name__, id(self), self.node, self.channel)
+		return '<{} at 0x{:x} for {!r}>'.format(type(self).__name__, id(self), self.node)
 	__str__ = __repr__
 
 	def stop(self):
@@ -428,15 +428,14 @@ class BackfillerWorker(object):
 		self.logger.info('Stopping')
 		self.stopping.set()
 
-	def backfill(self, hours):
+	def backfill(self):
 		"""Backfill from remote node.
 
 		Backfill from node/channel/qualities to base_dir/channel/qualities for
 		each hour in hours.
 		"""
-		for quality in self.qualities:
-			for hour in hours:
+		for channel, quality in itertools.product(self.channels, self.qualities):
+			for hour in list_hours(self.node, channel, quality, self.start):
 				# since backfilling can take a long time, recheck whether this
 				# hour is after the start
 				if self.start is not None:
@@ -449,8 +448,8 @@ class BackfillerWorker(object):
 				self.logger.info('Backfilling {}/{}'.format(quality, hour))
 
-				local_segments = set(list_local_segments(self.base_dir, self.channel, quality, hour))
-				remote_segments = set(list_remote_segments(self.node, self.channel, quality, hour))
+				local_segments = set(list_local_segments(self.base_dir, channel, quality, hour))
+				remote_segments = set(list_remote_segments(self.node, channel, quality, hour))
 				missing_segments = list(remote_segments - local_segments)
 
 				# randomise the order of the segments to reduce the chance that
@@ -465,7 +464,7 @@ class BackfillerWorker(object):
 					if self.stopping.is_set():
 						return
 
-					path = os.path.join(self.channel, quality, hour, missing_segment)
+					path = os.path.join(channel, quality, hour, missing_segment)
 
 					# test to see if file is a segment and get the segments start time
 					try:
@@ -488,7 +487,7 @@ class BackfillerWorker(object):
 					# start segment as soon as a pool slot opens up, then track it in workers
 					workers.append(pool.spawn(
 						get_remote_segment,
-						self.base_dir, self.node, self.channel, quality, hour, missing_segment, self.logger
+						self.base_dir, self.node, channel, quality, hour, missing_segment, self.logger
 					))
 
 				# verify that all the workers succeeded. if any failed, raise the exception from
@@ -497,7 +496,7 @@ class BackfillerWorker(object):
 					worker.get() # re-raise error, if any
 
 				self.logger.info('{} segments in {}/{} backfilled'.format(len(workers), quality, hour))
-				hours_backfilled.labels(remote=self.node, channel=self.channel, quality=quality).inc()
+				hours_backfilled.labels(remote=self.node, channel=channel, quality=quality).inc()
 
 	def run(self):
@@ -507,7 +506,7 @@ class BackfillerWorker(object):
 		while not self.stopping.is_set():
 			try:
 				self.logger.info('Starting backfill')
-				self.backfill(list_hours(self.node, self.channel, self.qualities, self.start))
+				self.backfill()
 				self.logger.info('Backfill complete')
 				failures = 0 #reset failure count on a successful backfill
 				if not self.run_once:
@@ -518,7 +517,7 @@ class BackfillerWorker(object):
 				failures += 1
 				delay = common.jitter(TIMEOUT * 2**failures)
 				self.logger.exception('Backfill failed. Retrying in {:.0f} s'.format(delay))
-				backfill_errors.labels(remote=self.node, channel=self.channel).inc()
+				backfill_errors.labels(remote=self.node).inc()
 				self.stopping.wait(delay)
 
 			if self.run_once:
@@ -566,18 +565,12 @@ def main(channels, base_dir='.', qualities='source', metrics_port=8002,
 	common.install_stacksampler()
 	prom.start_http_server(metrics_port)
 
-	managers = []
-	workers = []
-	for channel in channels:
-		logging.info('Starting backfilling {} with {} as qualities to {}'.format(channel, ', '.join(qualities), base_dir))
-		manager = BackfillerManager(base_dir, channel, qualities, static_nodes,
+	logging.info('Starting backfilling {} with {} as qualities to {}'.format(', '.join(channels), ', '.join(qualities), base_dir))
+	manager = BackfillerManager(base_dir, channels, qualities, static_nodes,
 		start, delete_old, run_once, node_file, node_database,
 		localhost, download_concurrency, recent_cutoff)
-		managers.append(manager)
-		workers.append(gevent.spawn(manager.run))
 
 	def stop():
-		for manager in managers:
-			manager.stop()
+		manager.stop()
 
 	gevent.signal(signal.SIGTERM, stop)
@@ -585,19 +578,6 @@
 	if backdoor_port:
 		gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start()
 
-	# Wait for any to die
-	gevent.wait(workers, count=1)
-	# If one has stopped, either:
-	# 1. stop() was called and all are stopping
-	# 2. one errored and we should stop all remaining and report the error
-	# Our behaviour in both cases is the same:
-	# 1. Tell all managers to gracefully stop
-	stop()
-	# 2. Wait (with timeout) until they've stopped
-	gevent.wait(workers)
-	# 3. Check if any of them failed. If they did, report it. If mulitple
-	# failed, we report one arbitrarily.
-	for worker in workers:
-		worker.get()
+	manager.run()
 
 	logging.info('Gracefully stopped')