Merge pull request #143 from ekimekim/mike/sheetsync/api-usage

sheetsync improvements
6 years ago · f452aa3c32
parent dfe9d61428 1c0f3a627b
commit f452aa3c32
1 changed files with 60 additions and 11 deletions
--- a/sheetsync/sheetsync/main.py
+++ b/sheetsync/sheetsync/main.py
@ -8,9 +8,10 @@ import argh
 import gevent.backdoor
 import gevent.event
 import prometheus_client as prom
-from requests import HTTPError
+from monotonic import monotonic
 from psycopg2 import sql
 from psycopg2.extras import register_uuid
 from requests import HTTPError
 import common
 import common.dateutil
@ -44,15 +45,30 @@ class SheetSync(object):
 	# Time between syncs
 	RETRY_INTERVAL = 5
 	# Time to wait after getting an error
 	ERROR_RETRY_INTERVAL = 10
 	# How many syncs of active sheets to do before checking inactive sheets.
 	# By checking inactive sheets less often, we stay within our API limits.
 	# For example, 4 syncs per inactive check * 5 seconds between syncs = 20s between inactive checks
 	SYNCS_PER_INACTIVE_CHECK = 4
 	# How many worksheets to keep "active" based on most recent modify time
 	ACTIVE_SHEET_COUNT = 2
 	# Expected quota usage per 100s =
 	#  (100 / RETRY_INTERVAL) * ACTIVE_SHEET_COUNT
 	#  + (100 / RETRY_INTERVAL / SYNCS_PER_INACTIVE_CHECK) * (len(worksheets) - ACTIVE_SHEET_COUNT)
 	# For current values, this is 100/5 * 2 + 100/5/4 * 6 = 70
 	def __init__(self, stop, dbmanager, sheets, sheet_id, worksheets, edit_url, bustime_start, allocate_ids=False):
 		self.stop = stop
 		self.dbmanager = dbmanager
 		self.sheets = sheets
 		self.sheet_id = sheet_id
-		self.worksheets = worksheets
+		# map {worksheet: last modify time}
 		self.worksheets = {w: 0 for w in worksheets}
 		self.edit_url = edit_url
 		self.bustime_start = bustime_start
 		self.allocate_ids = allocate_ids
@ -110,12 +126,19 @@ class SheetSync(object):
 		bustime = common.parse_bustime(value)
 		return common.bustime_to_dt(self.bustime_start, bustime)
-	def wait(self, interval):
+	def wait(self, base, interval):
-		self.stop.wait(common.jitter(interval))
+		"""Wait until INTERVAL seconds after BASE."""
 		now = monotonic()
 		to_wait = base + common.jitter(interval) - now
 		if to_wait > 0:
 			self.stop.wait(to_wait)
 	def run(self):
 		self.conn = self.dbmanager.get_conn()
 		# tracks when to do inactive checks
 		sync_count = 0
 		while not self.stop.is_set():
 			try:
@ -123,7 +146,19 @@ class SheetSync(object):
 				# each row is more expensive than the cost of just grabbing the entire table
 				# and comparing locally.
 				events = self.get_events()
-				for worksheet in self.worksheets:
+				if sync_count % self.SYNCS_PER_INACTIVE_CHECK == 0:
 					# check all worksheets
 					worksheets = self.worksheets
 				else:
 					# only check most recently changed worksheets
 					worksheets = sorted(
 						self.worksheets.keys(), key=lambda k: self.worksheets[k], reverse=True,
 					)[:self.ACTIVE_SHEET_COUNT]
 				sync_count += 1
 				sync_start = monotonic()
 				for worksheet in worksheets:
 					rows = self.sheets.get_rows(self.sheet_id, worksheet)
 					for row_index, row in enumerate(rows):
 						# Skip first row (ie. the column titles).
@ -145,11 +180,11 @@ class SheetSync(object):
 				# If we can't re-connect, the program will crash from here,
 				# then restart and wait until it can connect again.
 				self.conn = self.dbmanager.get_conn()
-				self.wait(self.ERROR_RETRY_INTERVAL)
+				self.wait(sync_start, self.ERROR_RETRY_INTERVAL)
 			else:
-				logging.info("Successful sync")
+				logging.info("Successful sync of worksheets: {}".format(", ".join(worksheets)))
 				sheets_synced.inc()
-				self.wait(self.RETRY_INTERVAL)
+				self.wait(sync_start, self.RETRY_INTERVAL)
 	def get_events(self):
 		"""Return the entire events table as a map {id: event namedtuple}"""
@ -161,7 +196,7 @@ class SheetSync(object):
 	def parse_row(self, row):
 		"""Take a row as a sequence of columns, and return a dict {column: value}"""
-		row_dict = {}
+		row_dict = {'_parse_errors': []}
 		for column, index in self.column_map.items():
 			if index >= len(row):
 				# Sheets omits trailing columns if they're all empty, so substitute empty string
@ -171,8 +206,9 @@ class SheetSync(object):
 			if column in self.column_parsers:
 				try:
 					value = self.column_parsers[column](value)
-				except ValueError:
+				except ValueError as e:
 					value = None
 					row_dict['_parse_errors'].append("Failed to parse column {}: {}".format(column, e))
 			row_dict[column] = value
 		return row_dict
@ -216,10 +252,15 @@ class SheetSync(object):
 			query(self.conn, built_query, sheet_name=worksheet, **row)
 			rows_found.labels(worksheet).inc()
 			rows_changed.labels('insert', worksheet).inc()
 			self.mark_modified(worksheet)
 			return
 		rows_found.labels(worksheet).inc()
 		# If no database error, but we have parse errors, indicate they should be displayed.
 		if event.error is None and row['_parse_errors']:
 			event = event._replace(error=", ".join(row['_parse_errors']))
 		# Update database with any changed inputs
 		changed = [col for col in self.input_columns if row[col] != getattr(event, col)]
 		if changed:
@ -237,6 +278,7 @@ class SheetSync(object):
 			))
 			query(self.conn, built_query, **row)
 			rows_changed.labels('input', worksheet).inc()
 			self.mark_modified(worksheet)
 		# Update sheet with any changed outputs
 		format_output = lambda v: '' if v is None else v # cast nulls to empty string
@ -252,6 +294,7 @@ class SheetSync(object):
 					format_output(getattr(event, col)),
 				)
 			rows_changed.labels('output', worksheet).inc()
 			self.mark_modified(worksheet)
 		# Set edit link if marked for editing and start/end set.
 		# This prevents accidents / clicking the wrong row and provides
@ -265,6 +308,12 @@ class SheetSync(object):
 				row_index, self.column_map['edit_link'],
 				edit_link,
 			)
 			self.mark_modified(worksheet)
 	def mark_modified(self, worksheet):
 		"""Mark worksheet as having had a change made, bumping it to the top of
 		the most-recently-modified queue."""
 		self.worksheets[worksheet] = monotonic()
@argh.arg('dbconnect', help=