Implement reverse sync mode

This is a mode where all data flows one-way from the database to the sheet.
It is intended to be used to populate an empty sheet from database events,
possibly sourced from somewhere else.

To make this work, a few changes were required:
* Track which ids we've seen so we know what events were not matched with a row
* Allow `row` to be None in `sync_row`
* When it is, call the middleware to create a new row with a new id
* In sheets, this is implemented by tracking the last empty rows we saw, and claiming them as needed.
pull/401/head
Mike Lang 1 year ago committed by Mike Lang
parent 85de9757f7
commit 74869de89d

@ -69,10 +69,11 @@ class SheetSync(object):
# Time to wait after getting an error # Time to wait after getting an error
ERROR_RETRY_INTERVAL = 10 ERROR_RETRY_INTERVAL = 10
def __init__(self, middleware, stop, dbmanager): def __init__(self, middleware, stop, dbmanager, reverse_sync=False):
self.middleware = middleware self.middleware = middleware
self.stop = stop self.stop = stop
self.dbmanager = dbmanager self.dbmanager = dbmanager
self.create_missing_ids = False
# List of input columns # List of input columns
self.input_columns = [ self.input_columns = [
'event_start', 'event_start',
@ -91,6 +92,14 @@ class SheetSync(object):
'state', 'state',
'error', 'error',
] ]
if reverse_sync:
# Reverse Sync refers to copying all event data from the database into the sheet,
# instead of it (mostly) being the other way. In particular:
# - All columns become output columns
# - We are allowed to create new sheet rows for database events if they don't exist.
self.create_missing_ids = True
self.output_columns += self.input_columns
self.input_columns = []
def run(self): def run(self):
self.conn = self.dbmanager.get_conn() self.conn = self.dbmanager.get_conn()
@ -103,10 +112,18 @@ class SheetSync(object):
# each row is more expensive than the cost of just grabbing the entire table # each row is more expensive than the cost of just grabbing the entire table
# and comparing locally. # and comparing locally.
events = self.get_events() events = self.get_events()
seen = set()
for row in self.middleware.get_rows(): for row in self.middleware.get_rows():
if row['id'] in seen:
logging.error("Duplicate id {}, skipping".format(row['id']))
continue
seen.add(row['id'])
self.sync_row(row, events.get(row['id'])) self.sync_row(row, events.get(row['id']))
for event in [e for id, e in events.items() if id not in seen]:
self.sync_row(event["sheet_name"], None, event)
except Exception as e: except Exception as e:
# for HTTPErrors, http response body includes the more detailed error # for HTTPErrors, http response body includes the more detailed error
detail = '' detail = ''
@ -151,11 +168,14 @@ class SheetSync(object):
return by_id return by_id
def sync_row(self, row, event): def sync_row(self, row, event):
"""Take a row dict and an Event from the database (or None if id not found) """Take a row dict (or None) and an Event from the database (or None)
and take whatever action is required to sync them, ie. writing to the database or sheet.""" and take whatever action is required to sync them, ie. writing to the database or sheet.
worksheet = row["sheet_name"] At least one of row and event must be non-None.
"""
if event is None: if event is None:
assert row
worksheet = row["sheet_name"]
# No event currently in DB, create it. # No event currently in DB, create it.
logging.info("Inserting new event {}".format(row['id'])) logging.info("Inserting new event {}".format(row['id']))
# Insertion conflict just means that another sheet sync beat us to the insert. # Insertion conflict just means that another sheet sync beat us to the insert.
@ -175,6 +195,15 @@ class SheetSync(object):
self.middleware.mark_modified(worksheet) self.middleware.mark_modified(worksheet)
return return
if row is None:
assert event
if not self.create_missing_ids:
logging.info("Skipping event {} without any matching row".format(event["id"]))
return
logging.info("Adding new row {}".format(event["id"]))
row = self.middleware.create_row(event["sheet_name"], event["id"])
worksheet = row["sheet_name"]
rows_found.labels(worksheet).inc() rows_found.labels(worksheet).inc()
# If no database error, but we have parse errors, indicate they should be displayed. # If no database error, but we have parse errors, indicate they should be displayed.
@ -188,7 +217,7 @@ class SheetSync(object):
event = event._replace(state='UNLISTED') event = event._replace(state='UNLISTED')
# Update database with any changed inputs # Update database with any changed inputs
changed = [col for col in self.input_columns if row[col] != getattr(event, col)] changed = [col for col in self.input_columns if row.get(col) != getattr(event, col)]
if changed: if changed:
logging.info("Updating event {} with new value(s) for {}".format( logging.info("Updating event {} with new value(s) for {}".format(
row['id'], ', '.join(changed) row['id'], ', '.join(changed)
@ -208,7 +237,7 @@ class SheetSync(object):
# Update sheet with any changed outputs # Update sheet with any changed outputs
format_output = lambda v: '' if v is None else v # cast nulls to empty string format_output = lambda v: '' if v is None else v # cast nulls to empty string
changed = [col for col in self.output_columns if row[col] != format_output(getattr(event, col))] changed = [col for col in self.output_columns if row.get(col) != format_output(getattr(event, col))]
if changed: if changed:
logging.info("Updating sheet row {} with new value(s) for {}".format( logging.info("Updating sheet row {} with new value(s) for {}".format(
row['id'], ', '.join(changed) row['id'], ', '.join(changed)
@ -324,12 +353,15 @@ class PlaylistSync:
@argh.arg('--playlist-worksheet', help= @argh.arg('--playlist-worksheet', help=
"An optional additional worksheet name that holds playlist tag definitions", "An optional additional worksheet name that holds playlist tag definitions",
) )
def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksheet_names, metrics_port=8005, backdoor_port=0, allocate_ids=False, playlist_worksheet=None): @argh.arg('--reverse-sync', help=
"Enables an alternate mode where all data is synced from the database to the sheet",
)
def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksheet_names, metrics_port=8005, backdoor_port=0, allocate_ids=False, playlist_worksheet=None, reverse_sync=False):
""" """
Sheet sync constantly scans a Google Sheets sheet and a database, copying inputs from the sheet Sheet sync constantly scans a Google Sheets sheet and a database, copying inputs from the sheet
to the DB and outputs from the DB to the sheet. to the DB and outputs from the DB to the sheet.
With the exception of id allocation, all operations are idempotent and multiple sheet syncs With the exception of id allocation or reverse sync mode, all operations are idempotent and multiple sheet syncs
may be run for redundancy. may be run for redundancy.
""" """
common.PromLogCountsHandler.install() common.PromLogCountsHandler.install()
@ -369,7 +401,7 @@ def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksh
sheets_middleware = SheetsMiddleware(sheets_client, sheet_id, worksheet_names, bustime_start, edit_url, allocate_ids) sheets_middleware = SheetsMiddleware(sheets_client, sheet_id, worksheet_names, bustime_start, edit_url, allocate_ids)
workers = [ workers = [
SheetSync(sheets_middleware, stop, dbmanager), SheetSync(sheets_middleware, stop, dbmanager, reverse_sync),
] ]
if playlist_worksheet: if playlist_worksheet:
workers.append(PlaylistSync(stop, dbmanager, sheets_client, sheet_id, playlist_worksheet)) workers.append(PlaylistSync(stop, dbmanager, sheets_client, sheet_id, playlist_worksheet))

@ -155,6 +155,8 @@ class SheetsMiddleware():
def get_rows(self): def get_rows(self):
"""Fetch all rows of worksheet, parsed into a list of dicts.""" """Fetch all rows of worksheet, parsed into a list of dicts."""
# Clear previously seen unassigned rows
self.unassigned_rows = {}
for worksheet in self.pick_worksheets(): for worksheet in self.pick_worksheets():
rows = self.sheets.get_rows(self.sheet_id, worksheet) rows = self.sheets.get_rows(self.sheet_id, worksheet)
for row_index, row in enumerate(rows): for row_index, row in enumerate(rows):
@ -167,10 +169,11 @@ class SheetsMiddleware():
# Handle rows without an allocated id # Handle rows without an allocated id
if row['id'] is None: if row['id'] is None:
# If a row is all empty (including no id), ignore it. # If a row is all empty (including no id), ignore it and mark it down for possible use in create_row().
# Ignore the tags column for this check since it is never non-empty due to implicit tags # Ignore the tags column for this check since it is never non-empty due to implicit tags
# (and even if there's other tags, we don't care if there's nothing else in the row). # (and even if there's other tags, we don't care if there's nothing else in the row).
if not any(row[col] for col in self.input_columns if col != 'tags'): if not any(row[col] for col in self.input_columns if col != 'tags'):
self.unassigned_rows.setdefault(worksheet, []).append(row["index"])
continue continue
# If we can't allocate ids, warn and ignore. # If we can't allocate ids, warn and ignore.
if not self.allocate_ids: if not self.allocate_ids:
@ -179,11 +182,7 @@ class SheetsMiddleware():
# Otherwise, allocate id for a new row. # Otherwise, allocate id for a new row.
row['id'] = str(uuid.uuid4()) row['id'] = str(uuid.uuid4())
logging.info(f"Allocating id for row {worksheet!r}:{row['index']} = {row['id']}") logging.info(f"Allocating id for row {worksheet!r}:{row['index']} = {row['id']}")
self.sheets.write_value( self.write_id(row)
self.sheet_id, worksheet,
row["index"], self.column_map['id'],
str(row['id']),
)
# Set edit link if marked for editing and start/end set. # Set edit link if marked for editing and start/end set.
# This prevents accidents / clicking the wrong row and provides # This prevents accidents / clicking the wrong row and provides
@ -199,6 +198,13 @@ class SheetsMiddleware():
yield row yield row
def write_id(self, row):
    """Write the id of the given row dict into the id column of that row.

    The target cell is located from row["sheet_name"] (worksheet),
    row["index"] (row position) and self.column_map['id'] (column).
    The id is always written in its string form.
    """
    cell_location = (
        self.sheet_id,
        row["sheet_name"],
        row["index"],
        self.column_map['id'],
    )
    self.sheets.write_value(*cell_location, str(row['id']))
def parse_row(self, worksheet, row_index, row): def parse_row(self, worksheet, row_index, row):
"""Take a row as a sequence of columns, and return a dict {column: value}""" """Take a row as a sequence of columns, and return a dict {column: value}"""
row_dict = {'_parse_errors': []} row_dict = {'_parse_errors': []}
@ -247,3 +253,14 @@ class SheetsMiddleware():
"""Mark row as having had a change made, bumping its worksheet to the top of """Mark row as having had a change made, bumping its worksheet to the top of
the most-recently-modified queue.""" the most-recently-modified queue."""
self.worksheets[row["sheet_name"]] = monotonic() self.worksheets[row["sheet_name"]] = monotonic()
def create_row(self, worksheet, id):
    """Claim an empty sheet row for an already-existing id and return it.

    Takes the earliest recorded entry of self.unassigned_rows[worksheet]
    (the indexes of empty rows noted down while scanning the sheet), writes
    the id into that row via write_id(), and returns a minimal row dict
    holding only "sheet_name", "id" and "index".

    Raises:
        KeyError: no empty rows have been recorded for this worksheet at all.
        IndexError: every recorded empty row of this worksheet is already claimed.
    """
    # NOTE: `id` shadows the builtin, but the parameter name is part of the
    # interface, so it is kept as-is.
    free_indexes = self.unassigned_rows.get(worksheet)
    if not free_indexes:
        # Same exception types as a bare dict lookup / list pop would give, but
        # with a message that says which worksheet ran out of rows and for which id,
        # and raised before anything is written to the sheet.
        error_type = KeyError if free_indexes is None else IndexError
        raise error_type(
            f"No empty rows left in worksheet {worksheet!r} to create a row for id {id}"
        )
    index = free_indexes.pop(0)
    row = {
        "sheet_name": worksheet,
        "id": id,
        "index": index,
    }
    logging.info(f"Assigning existing id {row['id']} to empty row {worksheet!r}:{row['index']}")
    self.write_id(row)
    return row

@ -104,3 +104,6 @@ class StreamLogMiddleware:
def mark_modified(self, row): def mark_modified(self, row):
pass # not a concept we have pass # not a concept we have
def create_row(self, worksheet, id):
    """Row creation is not implemented by this middleware; always raises NotImplementedError."""
    raise NotImplementedError()

Loading…
Cancel
Save