From cda8078f6413afa92d5885985947fe7c0c9a8cd3 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 3 Nov 2019 09:26:21 -0800 Subject: [PATCH] sheetsync: Only check the most recently changed two sheets most times Only check the other sheets every 4th time (20sec instead of 5sec). This eliminates a huge source of unnecessary reads, which prevents us from going over our API limit. --- sheetsync/sheetsync/main.py | 43 ++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/sheetsync/sheetsync/main.py b/sheetsync/sheetsync/main.py index ad66205..e001eda 100644 --- a/sheetsync/sheetsync/main.py +++ b/sheetsync/sheetsync/main.py @@ -8,9 +8,10 @@ import argh import gevent.backdoor import gevent.event import prometheus_client as prom -from requests import HTTPError +from monotonic import monotonic from psycopg2 import sql from psycopg2.extras import register_uuid +from requests import HTTPError import common import common.dateutil @@ -44,15 +45,30 @@ class SheetSync(object): # Time between syncs RETRY_INTERVAL = 5 + # Time to wait after getting an error ERROR_RETRY_INTERVAL = 10 + # How many syncs of active sheets to do before checking inactive sheets. + # By checking inactive sheets less often, we stay within our API limits. 
+ # For example, 4 syncs per inactive check * 5 seconds between syncs = 20s between inactive checks + SYNCS_PER_INACTIVE_CHECK = 4 + + # How many worksheets to keep "active" based on most recent modify time + ACTIVE_SHEET_COUNT = 2 + + # Expected quota usage per 100s = + # (100 / RETRY_INTERVAL) * ACTIVE_SHEET_COUNT + # + (100 / RETRY_INTERVAL / SYNCS_PER_INACTIVE_CHECK) * (len(worksheets) - ACTIVE_SHEET_COUNT) + # For current values, this is 100/5 * 2 + 100/5/4 * 6 = 70 + def __init__(self, stop, dbmanager, sheets, sheet_id, worksheets, edit_url, bustime_start, allocate_ids=False): self.stop = stop self.dbmanager = dbmanager self.sheets = sheets self.sheet_id = sheet_id - self.worksheets = worksheets + # map {worksheet: last modify time} + self.worksheets = {w: 0 for w in worksheets} self.edit_url = edit_url self.bustime_start = bustime_start self.allocate_ids = allocate_ids @@ -116,6 +132,9 @@ class SheetSync(object): def run(self): self.conn = self.dbmanager.get_conn() + # tracks when to do inactive checks + sync_count = 0 + while not self.stop.is_set(): try: @@ -123,7 +142,16 @@ class SheetSync(object): # each row is more expensive than the cost of just grabbing the entire table # and comparing locally. events = self.get_events() - for worksheet in self.worksheets: + if sync_count % self.SYNCS_PER_INACTIVE_CHECK == 0: + # check all worksheets + worksheets = self.worksheets + else: + # only check most recently changed worksheets + worksheets = sorted( + self.worksheets.keys(), key=lambda k: self.worksheets[k], reverse=True, + )[:self.ACTIVE_SHEET_COUNT] + sync_count += 1 + for worksheet in worksheets: rows = self.sheets.get_rows(self.sheet_id, worksheet) for row_index, row in enumerate(rows): # Skip first row (ie. the column titles). 
@@ -216,6 +244,7 @@ class SheetSync(object): query(self.conn, built_query, sheet_name=worksheet, **row) rows_found.labels(worksheet).inc() rows_changed.labels('insert', worksheet).inc() + self.mark_modified(worksheet) return rows_found.labels(worksheet).inc() @@ -237,6 +266,7 @@ class SheetSync(object): )) query(self.conn, built_query, **row) rows_changed.labels('input', worksheet).inc() + self.mark_modified(worksheet) # Update sheet with any changed outputs format_output = lambda v: '' if v is None else v # cast nulls to empty string @@ -252,6 +282,7 @@ class SheetSync(object): format_output(getattr(event, col)), ) rows_changed.labels('output', worksheet).inc() + self.mark_modified(worksheet) # Set edit link if marked for editing and start/end set. # This prevents accidents / clicking the wrong row and provides @@ -265,6 +296,12 @@ class SheetSync(object): row_index, self.column_map['edit_link'], edit_link, ) + self.mark_modified(worksheet) + + def mark_modified(self, worksheet): + """Mark worksheet as having had a change made, bumping it to the top of + the most-recently-modified queue.""" + self.worksheets[worksheet] = monotonic() @argh.arg('dbconnect', help=