From fc791e03d452e7907cf45744f928148de707cf80 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 27 Oct 2019 01:08:12 -0700 Subject: [PATCH 1/4] DBManager: Don't test connection on start This gives the individual services more freedom in how to handle a failing connection. --- common/common/database.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/common/common/database.py b/common/common/database.py index bb34003..8bac136 100644 --- a/common/common/database.py +++ b/common/common/database.py @@ -19,8 +19,7 @@ class DBManager(object): returning them. It has the ability to serve as a primitive connection pool, as getting a - new conn will return existing conns it knows about first, but this mainly - just exists to re-use the initial conn used to test the connection, and you + new conn will return existing conns it knows about first, but you should use a real conn pool for any non-trivial use. Returned conns are set to seralizable isolation level, autocommit, and use @@ -30,9 +29,6 @@ class DBManager(object): self.conns = [] self.connect_timeout = connect_timeout self.connect_kwargs = connect_kwargs - # get a connection to test whether connection is working. - conn = self.get_conn() - self.put_conn(conn) def put_conn(self, conn): self.conns.append(conn) From 0e437566aaf2c042dfc4720cbb6a5f29b4cb7b74 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 27 Oct 2019 01:10:05 -0700 Subject: [PATCH 2/4] backfiller: Don't crash on DB errors We move all connection handling into get_nodes(). This means that problems connecting won't cause further errors and cause the application to completely crash. In turn, this means that the behaviour if the database goes down becomes "continue backfilling from the nodes we know about" instead of crashing. --- backfiller/backfiller/main.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/backfiller/backfiller/main.py b/backfiller/backfiller/main.py index 77d6954..f70cbc8 100644 --- a/backfiller/backfiller/main.py +++ b/backfiller/backfiller/main.py @@ -189,9 +189,8 @@ class BackfillerManager(object): self.start = start self.run_once = run_once self.node_file = node_file - self.node_database = node_database - if self.node_database is not None: - self.db_manager = database.DBManager(dsn=self.node_database) + self.db_manager = None if node_database is None else database.DBManager(dsn=node_database) + self.connection = None self.localhost = localhost self.download_concurrency = download_concurrency self.recent_cutoff = recent_cutoff @@ -228,8 +227,6 @@ class BackfillerManager(object): get_nodes are stopped. If self.run_once, only call nodes once. Calling stop will exit the loop.""" self.logger.info('Starting') - if self.node_database is not None: - self.connection = self.db_manager.get_conn() failures = 0 while not self.stopping.is_set(): @@ -238,17 +235,12 @@ class BackfillerManager(object): except Exception: # To ensure a fresh slate and clear any DB-related errors, get a new conn on error. # This is heavy-handed but simple and effective. - if self.node_database is not None: - self.connection = self.db_manager.get_conn() + self.connection = None if failures < MAX_BACKOFF: failures += 1 delay = common.jitter(TIMEOUT * 2**failures) self.logger.exception('Getting nodes failed. Retrying in {:.0f} s'.format(delay)) - try: - host = [s.split('=')[-1] for s in self.connection.dsn.split() if 'host' in s][0] - except Exception: - host = '' - node_list_errors.labels(filename=self.node_file, database=host).inc() + node_list_errors.labels(filename=self.node_file).inc() self.stopping.wait(delay) continue exisiting_nodes = set(self.workers.keys()) @@ -299,7 +291,9 @@ class BackfillerManager(object): else: nodes[substrs[0]] = substrs[1] - if self.node_database is not None: + if self.db_manager is not None: + if self.connection is None: + self.connection = self.db_manager.get_conn() host = [s.split('=')[-1] for s in self.connection.dsn.split() if 'host' in s][0] self.logger.info('Fetching list of nodes from {}'.format(host)) results = database.query(self.connection, """ From 17af1c4e891ed7ba2ac4200a589967f3fed5f3d0 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 27 Oct 2019 01:19:54 -0700 Subject: [PATCH 3/4] cutter, sheetsync: Wait for DB to connect on startup This is a nicer error than crashing in the depths of some error handler (which is what will happen if the DB goes unavailable while they're running), and it's a far more common case (eg. the DB is misconfigured) than having it fail halfway through. Neither of these services can do anything meaningful without the DB, so crashing without it is acceptable behaviour. --- cutter/cutter/main.py | 11 +++++++++-- sheetsync/sheetsync/main.py | 14 +++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/cutter/cutter/main.py b/cutter/cutter/main.py index 549539e..b399724 100644 --- a/cutter/cutter/main.py +++ b/cutter/cutter/main.py @@ -565,13 +565,20 @@ def main( # We want to error if either errors, and shut down if either exits. dbmanager = None stopping = gevent.event.Event() - while dbmanager is None: + dbmanager = DBManager(dsn=dbconnect) + while True: try: - dbmanager = DBManager(dsn=dbconnect) + # Get a test connection so we know the database is up, + # this produces a clearer error in cases where there's a connection problem. + conn = dbmanager.get_conn() except Exception: delay = common.jitter(10) logging.info('Cannot connect to database. Retrying in {:.0f} s'.format(delay)) stop.wait(delay) + else: + # put it back so it gets reused on next get_conn() + dbmanager.put_conn(conn) + break with open(creds_file) as f: credentials = json.load(f) diff --git a/sheetsync/sheetsync/main.py b/sheetsync/sheetsync/main.py index 5ffa076..27b612d 100644 --- a/sheetsync/sheetsync/main.py +++ b/sheetsync/sheetsync/main.py @@ -130,6 +130,8 @@ class SheetSync(object): sync_errors.inc() # To ensure a fresh slate and clear any DB-related errors, get a new conn on error. # This is heavy-handed but simple and effective. + # If we can't re-connect, the program will crash from here, + # then restart and wait until it can connect again. self.conn = self.dbmanager.get_conn() self.wait(self.ERROR_RETRY_INTERVAL) else: @@ -293,14 +295,20 @@ def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksh logging.info("Starting up") - dbmanager = None - while dbmanager is None: + dbmanager = DBManager(dsn=dbconnect) + while True: try: - dbmanager = DBManager(dsn=dbconnect) + # Get a test connection so we know the database is up, + # this produces a clearer error in cases where there's a connection problem. + conn = dbmanager.get_conn() except Exception: delay = common.jitter(10) logging.info('Cannot connect to database. Retrying in {:.0f} s'.format(delay)) stop.wait(delay) + else: + # put it back so it gets reused on next get_conn() + dbmanager.put_conn(conn) + break sheets_creds = json.load(open(sheets_creds_file)) From a42e7b48f635309eb942ad7c542da1fa683a6bc7 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 27 Oct 2019 01:23:05 -0700 Subject: [PATCH 4/4] thrimshim: Allow degraded operation even if DB broken Any endpoints that don't need a DB conn will still work fine. Notably, this includes /defaults, which is needed for thrimbletrimmer to work in a non-specific-row mode. --- thrimshim/thrimshim/main.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/thrimshim/thrimshim/main.py b/thrimshim/thrimshim/main.py index cca218b..41d5b2f 100644 --- a/thrimshim/thrimshim/main.py +++ b/thrimshim/thrimshim/main.py @@ -347,14 +347,7 @@ def main(connection_string, default_channel, bustime_start, host='0.0.0.0', port sys.exit() gevent.signal(signal.SIGTERM, stop) - app.db_manager = None - while app.db_manager is None and not stopping.is_set(): - try: - app.db_manager = database.DBManager(dsn=connection_string) - except Exception: - delay = common.jitter(10) - logging.info('Cannot connect to database. Retrying in {:.0f} s'.format(delay)) - stopping.wait(delay) + app.db_manager = database.DBManager(dsn=connection_string) common.PromLogCountsHandler.install() common.install_stacksampler()