From 96181fd875593ea9b65bc0091cb4478f270ce797 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Fri, 16 Aug 2024 03:15:46 +1000 Subject: [PATCH] Support archive sync in sheetsync again --- docker-compose.jsonnet | 15 ++++++++-- sheetsync/sheetsync/main.py | 36 +++++++++++++++++++---- sheetsync/sheetsync/sheets.py | 55 ++++++++++++++++++++++++++--------- 3 files changed, 85 insertions(+), 21 deletions(-) diff --git a/docker-compose.jsonnet b/docker-compose.jsonnet index 0dd43e1..e7b3839 100644 --- a/docker-compose.jsonnet +++ b/docker-compose.jsonnet @@ -446,7 +446,7 @@ backend: "sheets", creds: "/etc/sheet-creds.json", sheet_id: $.sheet_id, - allocate_ids: true, + allocate_ids: ! $.sheet_reverse_sync, reverse_sync: $.sheet_reverse_sync, }, local sync_sheet = [ @@ -462,7 +462,18 @@ type: "playlists", worksheets: [$.playlist_worksheet], }, - ], + ] + (if $.archive_worksheet == null then [] else [ + sync_sheet_base + { + name: "sheet-archive", + type: "archive", + worksheets: [$.archive_worksheet], + edit_url: $.edit_url, + bustime_start: $.bustime_start, + // archive is never reverse sync + allocate_ids: true, + reverse_sync: false, + } + ]), local sync_streamlog_base = { backend: "streamlog", creds: "/etc/streamlog-token.txt", diff --git a/sheetsync/sheetsync/main.py b/sheetsync/sheetsync/main.py index 12e5e1f..0d4b889 100644 --- a/sheetsync/sheetsync/main.py +++ b/sheetsync/sheetsync/main.py @@ -17,7 +17,7 @@ import common.dateutil from common.database import DBManager, query, get_column_placeholder from common.sheets import Sheets as SheetsClient -from .sheets import SheetsEventsMiddleware, SheetsPlaylistsMiddleware +from .sheets import SheetsEventsMiddleware, SheetsPlaylistsMiddleware, SheetsArchiveMiddleware from .streamlog import StreamLogClient, StreamLogEventsMiddleware, StreamLogPlaylistsMiddleware sheets_synced = prom.Counter( @@ -304,6 +304,25 @@ class EventsSync(SheetSync): super().sync_row(sheet_row, db_row) +class ArchiveSync(EventsSync): + # Archive 
events are a special case of event with fewer input columns. + # The other input columns default to empty string in the database. + input_columns = { + 'sheet_name', + 'event_start', + 'event_end', + 'description', + 'notes', + } + output_columns = { + 'state', + 'error', + } + # Slower poll rate than events to avoid using large amounts of quota + retry_interval = 20 + error_retry_interval = 20 + + class PlaylistsSync(SheetSync): # Slower poll rate than events to avoid using large amounts of quota @@ -336,7 +355,7 @@ class PlaylistsSync(SheetSync): Always present: name: A human identifier for this sync operation backend: The data source. One of "sheets" or "streamlog" - type: What kind of data is being synced. One of "events" or "playlists" + type: What kind of data is being synced. One of "events", "playlists" or "archive" When backend is "sheets": creds: path to credentials JSON file containing "client_id", "client_secret" and "refresh_token" sheet_id: The id of the Google Sheet to use @@ -346,7 +365,7 @@ class PlaylistsSync(SheetSync): reverse_sync: Boolean, optional. When true, enables an alternate mode where all data is synced from the database to the sheet. Only one sheetsync acting on the same sheet should have this enabled. 
- When type is "events": + When type is "events" or "archive": edit_url: a format string for edit links, with {} as a placeholder for id bustime_start: Timestamp string at which bustime is 00:00 When backend is "streamlog": @@ -402,8 +421,12 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0): refresh_token=creds['refresh_token'], ) allocate_ids = config.get("allocate_ids", False) - if config["type"] == "sheets": - middleware = SheetsEventsMiddleware( + if config["type"] in ("events", "archive"): + middleware_cls = { + "events": SheetsEventsMiddleware, + "archive": SheetsArchiveMiddleware, + }[config["type"]] + middleware = middleware_cls( client, config["sheet_id"], config["worksheets"], @@ -431,6 +454,8 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0): middleware = StreamLogEventsMiddleware(client) elif config["type"] == "playlists": middleware = StreamLogPlaylistsMiddleware(client) + elif config["type"] == "archive": + raise ValueError("Archive sync is not compatible with streamlog") else: raise ValueError("Unknown type {!r}".format(config["type"])) else: @@ -439,6 +464,7 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0): sync_class = { "events": EventsSync, "playlists": PlaylistsSync, + "archive": ArchiveSync, }[config["type"]] reverse_sync = config.get("reverse_sync", False) sync = sync_class(config["name"], middleware, stop, dbmanager, reverse_sync) diff --git a/sheetsync/sheetsync/sheets.py b/sheetsync/sheetsync/sheets.py index b9e7a4a..4291b73 100644 --- a/sheetsync/sheetsync/sheets.py +++ b/sheetsync/sheetsync/sheets.py @@ -23,6 +23,9 @@ class SheetsMiddleware(Middleware): # + (100 / RETRY_INTERVAL / SYNCS_PER_INACTIVE_CHECK) * (len(worksheets) - ACTIVE_SHEET_COUNT) # For current values, this is 100/5 * 2 + 100/5/4 * 7 = 75 + # Number of initial rows to ignore as they contain headers + header_rows = 1 + # Maps DB column names (or general identifier, for non-DB columns) to sheet column indexes. 
# id is required. column_map = { @@ -77,10 +80,10 @@ class SheetsMiddleware(Middleware): for worksheet in worksheets: rows = self.client.get_rows(self.sheet_id, worksheet) for row_index, row in enumerate(rows): - # Skip first row (ie. the column titles). + # Skip first row or rows (ie. the column titles). # Need to do it inside the loop and not eg. use rows[1:], # because then row_index won't be correct. - if row_index == 0: + if row_index < self.header_rows: continue row = self.parse_row(worksheet, row_index, row) @@ -116,7 +119,11 @@ class SheetsMiddleware(Middleware): def parse_row(self, worksheet, row_index, row): """Take a row as a sequence of columns, and return a dict {column: value}""" - row_dict = {'_parse_errors': []} + row_dict = { + "sheet_name": worksheet, + "index": row_index, + '_parse_errors': [], + } for column, index in self.column_map.items(): if index >= len(row): # Sheets omits trailing columns if they're all empty, so substitute empty string @@ -286,20 +293,20 @@ class SheetsEventsMiddleware(SheetsMiddleware): # As a special case, add some implicit tags to the tags column. # We prepend these to make it slightly more consistent for the editor, # ie. it's always DAY, CATEGORY, POSTER_MOMENT, CUSTOM - row_dict['tags'] = ( - [ - row_dict['category'], # category name - worksheet, # sheet name - ] + (['Poster Moment'] if row_dict['poster_moment'] else []) - + row_dict['tags'] - ) + # This is only needed for full events (not the archive sheet), + # so only do it if we had a tags column in the first place. + if 'tags' in row_dict: + row_dict['tags'] = ( + [ + row_dict['category'], # category name + worksheet, # sheet name + ] + (['Poster Moment'] if row_dict['poster_moment'] else []) + + row_dict['tags'] + ) # As a special case, treat an end time of "--" as equal to the start time. 
if row_dict["event_end"] == "--": row_dict["event_end"] = row_dict["event_start"] - # Always include row index and worksheet - row_dict["index"] = row_index - row_dict["sheet_name"] = worksheet # Set edit link if marked for editing and start/end set. # This prevents accidents / clicking the wrong row and provides @@ -307,10 +314,30 @@ class SheetsEventsMiddleware(SheetsMiddleware): # Also clear it if it shouldn't be set. # We do this here instead of in sync_row() because it's Sheets-specific logic # that doesn't depend on the DB event in any way. - edit_link = self.edit_url.format(row['id']) if row['marked_for_edit'] == '[+] Marked' else '' + edit_link = self.edit_url.format(row['id']) if self.show_edit_url(row) else '' if row['edit_link'] != edit_link: logging.info("Updating sheet row {} with edit link {}".format(row['id'], edit_link)) self.write_value(row, "edit_link", edit_link) self.mark_modified(row) return row_dict + + def show_edit_url(self, row): + return row['marked_for_edit'] == '[+] Marked' + + +class SheetsArchiveMiddleware(SheetsEventsMiddleware): + # Archive sheet is similar to events sheet but is missing some columns. + column_map = { + 'event_start': 0, + 'event_end': 1, + 'description': 2, + 'state': 3, + 'notes': 4, + 'edit_link': 6, + 'error': 7, + 'id': 8, + } + + def show_edit_url(self, row): + return row['event_start'] is not None and row['event_end'] is not None