backfiller: Don't crash on DB errors

We move all connection handling into get_nodes().
This means that a failure to connect no longer raises a further error
that crashes the application completely.

In turn, this means that if the database goes down, the behaviour becomes
"continue backfilling from the nodes we know about" instead of crashing.
pull/119/head
Mike Lang 5 years ago
parent fc791e03d4
commit 0e437566aa

@ -189,9 +189,8 @@ class BackfillerManager(object):
self.start = start self.start = start
self.run_once = run_once self.run_once = run_once
self.node_file = node_file self.node_file = node_file
self.node_database = node_database self.db_manager = None if node_database is None else database.DBManager(dsn=node_database)
if self.node_database is not None: self.connection = None
self.db_manager = database.DBManager(dsn=self.node_database)
self.localhost = localhost self.localhost = localhost
self.download_concurrency = download_concurrency self.download_concurrency = download_concurrency
self.recent_cutoff = recent_cutoff self.recent_cutoff = recent_cutoff
@ -228,8 +227,6 @@ class BackfillerManager(object):
get_nodes are stopped. If self.run_once, only call nodes once. Calling get_nodes are stopped. If self.run_once, only call nodes once. Calling
stop will exit the loop.""" stop will exit the loop."""
self.logger.info('Starting') self.logger.info('Starting')
if self.node_database is not None:
self.connection = self.db_manager.get_conn()
failures = 0 failures = 0
while not self.stopping.is_set(): while not self.stopping.is_set():
@ -238,17 +235,12 @@ class BackfillerManager(object):
except Exception: except Exception:
# To ensure a fresh slate and clear any DB-related errors, get a new conn on error. # To ensure a fresh slate and clear any DB-related errors, get a new conn on error.
# This is heavy-handed but simple and effective. # This is heavy-handed but simple and effective.
if self.node_database is not None: self.connection = None
self.connection = self.db_manager.get_conn()
if failures < MAX_BACKOFF: if failures < MAX_BACKOFF:
failures += 1 failures += 1
delay = common.jitter(TIMEOUT * 2**failures) delay = common.jitter(TIMEOUT * 2**failures)
self.logger.exception('Getting nodes failed. Retrying in {:.0f} s'.format(delay)) self.logger.exception('Getting nodes failed. Retrying in {:.0f} s'.format(delay))
try: node_list_errors.labels(filename=self.node_file).inc()
host = [s.split('=')[-1] for s in self.connection.dsn.split() if 'host' in s][0]
except Exception:
host = ''
node_list_errors.labels(filename=self.node_file, database=host).inc()
self.stopping.wait(delay) self.stopping.wait(delay)
continue continue
exisiting_nodes = set(self.workers.keys()) exisiting_nodes = set(self.workers.keys())
@ -299,7 +291,9 @@ class BackfillerManager(object):
else: else:
nodes[substrs[0]] = substrs[1] nodes[substrs[0]] = substrs[1]
if self.node_database is not None: if self.db_manager is not None:
if self.connection is None:
self.connection = self.db_manager.get_conn()
host = [s.split('=')[-1] for s in self.connection.dsn.split() if 'host' in s][0] host = [s.split('=')[-1] for s in self.connection.dsn.split() if 'host' in s][0]
self.logger.info('Fetching list of nodes from {}'.format(host)) self.logger.info('Fetching list of nodes from {}'.format(host))
results = database.query(self.connection, """ results = database.query(self.connection, """

Loading…
Cancel
Save