This significantly increases throughput when downloading large ranges of segments.
The max concurrency is exposed as a CLI arg.
We also slightly modify the logged info so it reports the number of segments actually
downloaded, not just the number of missing segments (some of which we may skip
downloading for various reasons).
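A minimal sketch of the concurrency pattern, assuming gevent is used (the `worker.get()` calls below suggest greenlets); `get_remote_segment`, `self.base_dir`, `self.node` and `self.download_concurrency` are illustrative names, not necessarily the real ones:

```python
import gevent.pool

def backfill_hour(self, variant, hour, missing_segments):
    # Bound concurrent downloads so we don't overload the remote node.
    # self.download_concurrency would be set from the --download-concurrency CLI arg.
    pool = gevent.pool.Pool(self.download_concurrency)
    workers = [
        # get_remote_segment is a stand-in for the actual per-segment download helper.
        pool.spawn(get_remote_segment, self.base_dir, self.node, variant, hour, segment)
        for segment in missing_segments
    ]
    pool.join()  # block until every download greenlet has finished or failed
    return workers
```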
        # verify that all the workers succeeded. If any failed, raise the exception from
        # one of them arbitrarily.
        for worker in workers:
            worker.get()  # re-raise error, if any

        self.logger.info('{} segments in {}/{} backfilled'.format(len(workers), variant, hour))
    def run(self):
        self.logger.info('Starting')
@@ -388,9 +405,11 @@ class BackfillerWorker(object):
@argh.arg('--node-file', help="Name of file listing nodes to backfill from. One node per line in the form NAME URI with whitespace only lines or lines starting with '#' ignored. If None (default) do not get nodes from a file.")
@argh.arg('--node-database', help='Postgres connection string for database to fetch a list of nodes from. Either a space-separated list of key=value pairs, or a URI like: postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE . If None (default) do not get nodes from database.')
@argh.arg('--localhost', help='Name of local machine. Used to prevent backfilling from itself. By default the result of socket.gethostname()')
@argh.arg('--download-concurrency', help='Max number of concurrent segment downloads from a single node. Increasing this number may increase throughput, but too high a value can overload the server or cause timeouts.')
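For example, a node could be started with a higher concurrency like this (the module invocation and file name are illustrative, only the flags above come from the actual code):

```python
# python -m backfiller --node-file nodes.txt --download-concurrency 10
```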