From 96181fd875593ea9b65bc0091cb4478f270ce797 Mon Sep 17 00:00:00 2001
From: Mike Lang <mikelang3000@gmail.com>
Date: Fri, 16 Aug 2024 03:15:46 +1000
Subject: [PATCH] Support archive sync in sheetsync again

---
 docker-compose.jsonnet        | 15 ++++++++--
 sheetsync/sheetsync/main.py   | 36 +++++++++++++++++++----
 sheetsync/sheetsync/sheets.py | 55 ++++++++++++++++++++++++++---------
 3 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/docker-compose.jsonnet b/docker-compose.jsonnet
index 0dd43e1..e7b3839 100644
--- a/docker-compose.jsonnet
+++ b/docker-compose.jsonnet
@@ -446,7 +446,7 @@
         backend: "sheets",
         creds: "/etc/sheet-creds.json",
         sheet_id: $.sheet_id,
-        allocate_ids: true,
+        allocate_ids: ! $.sheet_reverse_sync,
         reverse_sync: $.sheet_reverse_sync,
       },
       local sync_sheet = [
@@ -462,7 +462,18 @@
           type: "playlists",
           worksheets: [$.playlist_worksheet],
         },
-      ],
+      ] + (if $.archive_worksheet == null then [] else [
+        sync_sheet_base + {
+          name: "sheet-archive",
+          type: "archive",
+          worksheets: [$.archive_worksheet],
+          edit_url: $.edit_url,
+          bustime_start: $.bustime_start,
+          // archive is never reverse sync
+          allocate_ids: true,
+          reverse_sync: false,
+        }
+      ]),
       local sync_streamlog_base = {
         backend: "streamlog",
         creds: "/etc/streamlog-token.txt",
diff --git a/sheetsync/sheetsync/main.py b/sheetsync/sheetsync/main.py
index 12e5e1f..0d4b889 100644
--- a/sheetsync/sheetsync/main.py
+++ b/sheetsync/sheetsync/main.py
@@ -17,7 +17,7 @@ import common.dateutil
 from common.database import DBManager, query, get_column_placeholder
 from common.sheets import Sheets as SheetsClient
 
-from .sheets import SheetsEventsMiddleware, SheetsPlaylistsMiddleware
+from .sheets import SheetsEventsMiddleware, SheetsPlaylistsMiddleware, SheetsArchiveMiddleware
 from .streamlog import StreamLogClient, StreamLogEventsMiddleware, StreamLogPlaylistsMiddleware
 
 sheets_synced = prom.Counter(
@@ -304,6 +304,25 @@ class EventsSync(SheetSync):
 		super().sync_row(sheet_row, db_row)
 
 
+class ArchiveSync(EventsSync):
+	# Archive events are a special case of event with fewer input columns.
+	# The other input columns default to empty string in the database.
+	input_columns = {
+		'sheet_name',
+		'event_start',
+		'event_end',
+		'description',
+		'notes',
+	}
+	output_columns = {
+		'state',
+		'error',
+	}
+	# Slower poll rate than events to avoid using large amounts of quota
+	retry_interval = 20
+	error_retry_interval = 20
+
+
 class PlaylistsSync(SheetSync):
 
 	# Slower poll rate than events to avoid using large amounts of quota
@@ -336,7 +355,7 @@ class PlaylistsSync(SheetSync):
 		Always present:
 			name: A human identifier for this sync operation
 			backend: The data source. One of "sheets" or "streamlog"
-			type: What kind of data is being synced. One of "events" or "playlists"
+			type: What kind of data is being synced. One of "events", "playlists" or "archive"
 		When backend is "sheets":
 			creds: path to credentials JSON file containing "client_id", "client_secret" and "refresh_token"
 			sheet_id: The id of the Google Sheet to use
@@ -346,7 +365,7 @@ class PlaylistsSync(SheetSync):
 			reverse_sync: Boolean, optional. When true, enables an alternate mode
 				where all data is synced from the database to the sheet.
 				Only one sheetsync acting on the same sheet should have this enabled.
-			When type is "events":
+			When type is "events" or "archive":
 				edit_url: a format string for edit links, with {} as a placeholder for id
 				bustime_start: Timestamp string at which bustime is 00:00
 		When backend is "streamlog":
@@ -402,8 +421,12 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0):
 				refresh_token=creds['refresh_token'],
 			)
 			allocate_ids = config.get("allocate_ids", False)
-			if config["type"] == "sheets":
-				middleware = SheetsEventsMiddleware(
+			if config["type"] in ("events", "archive"):
+				middleware_cls = {
+					"events": SheetsEventsMiddleware,
+					"archive": SheetsArchiveMiddleware,
+				}[config["type"]] # pick the class for this type; the dict itself is not callable
+				middleware = middleware_cls(
 					client,
 					config["sheet_id"],
 					config["worksheets"],
@@ -431,6 +454,8 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0):
 				middleware = StreamLogEventsMiddleware(client)
 			elif config["type"] == "playlists":
 				middleware = StreamLogPlaylistsMiddleware(client)
+			elif config["type"] == "archive":
+				raise ValueError("Archive sync is not compatible with streamlog")
 			else:
 				raise ValueError("Unknown type {!r}".format(config["type"]))
 		else:
@@ -439,6 +464,7 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0):
 		sync_class = {
 			"events": EventsSync,
 			"playlists": PlaylistsSync,
+			"archive": ArchiveSync,
 		}[config["type"]]
 		reverse_sync = config.get("reverse_sync", False)
 		sync = sync_class(config["name"], middleware, stop, dbmanager, reverse_sync)
diff --git a/sheetsync/sheetsync/sheets.py b/sheetsync/sheetsync/sheets.py
index b9e7a4a..4291b73 100644
--- a/sheetsync/sheetsync/sheets.py
+++ b/sheetsync/sheetsync/sheets.py
@@ -23,6 +23,9 @@ class SheetsMiddleware(Middleware):
 	#  + (100 / RETRY_INTERVAL / SYNCS_PER_INACTIVE_CHECK) * (len(worksheets) - ACTIVE_SHEET_COUNT)
 	# For current values, this is 100/5 * 2 + 100/5/4 * 7 = 75
 
+	# Number of initial rows to ignore as they contain headers
+	header_rows = 1
+
 	# Maps DB column names (or general identifier, for non-DB columns) to sheet column indexes.
 	# id is required.
 	column_map = {
@@ -77,10 +80,10 @@ class SheetsMiddleware(Middleware):
 		for worksheet in worksheets:
 			rows = self.client.get_rows(self.sheet_id, worksheet)
 			for row_index, row in enumerate(rows):
-				# Skip first row (ie. the column titles).
+				# Skip first row or rows (ie. the column titles).
 				# Need to do it inside the loop and not eg. use rows[1:],
 				# because then row_index won't be correct.
-				if row_index == 0:
+				if row_index < self.header_rows:
 					continue
 				row = self.parse_row(worksheet, row_index, row)
 
@@ -116,7 +119,11 @@ class SheetsMiddleware(Middleware):
 
 	def parse_row(self, worksheet, row_index, row):
 		"""Take a row as a sequence of columns, and return a dict {column: value}"""
-		row_dict = {'_parse_errors': []}
+		row_dict = {
+			"sheet_name": worksheet,
+			"index": row_index,
+			'_parse_errors': [],
+		}
 		for column, index in self.column_map.items():
 			if index >= len(row):
 				# Sheets omits trailing columns if they're all empty, so substitute empty string
@@ -286,20 +293,20 @@ class SheetsEventsMiddleware(SheetsMiddleware):
 		# As a special case, add some implicit tags to the tags column.
 		# We prepend these to make it slightly more consistent for the editor,
 		# ie. it's always DAY, CATEGORY, POSTER_MOMENT, CUSTOM
-		row_dict['tags'] = (
-			[
-				row_dict['category'], # category name
-				worksheet, # sheet name
-			] + (['Poster Moment'] if row_dict['poster_moment'] else [])
-			+ row_dict['tags']
-		)
+		# This is only needed for full events (not the archive sheet),
+		# so only do it if we had a tags column in the first place.
+		if 'tags' in row_dict:
+			row_dict['tags'] = (
+				[
+					row_dict['category'], # category name
+					worksheet, # sheet name
+				] + (['Poster Moment'] if row_dict['poster_moment'] else [])
+				+ row_dict['tags']
+			)
 
 		# As a special case, treat an end time of "--" as equal to the start time.
 		if row_dict["event_end"] == "--":
 			row_dict["event_end"] = row_dict["event_start"]
-		# Always include row index and worksheet
-		row_dict["index"] = row_index
-		row_dict["sheet_name"] = worksheet
 
 		# Set edit link if marked for editing and start/end set.
 		# This prevents accidents / clicking the wrong row and provides
@@ -307,10 +314,30 @@ class SheetsEventsMiddleware(SheetsMiddleware):
 		# Also clear it if it shouldn't be set.
 		# We do this here instead of in sync_row() because it's Sheets-specific logic
 		# that doesn't depend on the DB event in any way.
-		edit_link = self.edit_url.format(row['id']) if row['marked_for_edit'] == '[+] Marked' else ''
+		edit_link = self.edit_url.format(row['id']) if self.show_edit_url(row) else ''
 		if row['edit_link'] != edit_link:
 			logging.info("Updating sheet row {} with edit link {}".format(row['id'], edit_link))
 			self.write_value(row, "edit_link", edit_link)
 			self.mark_modified(row)
 
 		return row_dict
+
+	def show_edit_url(self, row): # hook for subclasses: should this row get an edit link?
+		return row['marked_for_edit'] == '[+] Marked'
+
+
+class SheetsArchiveMiddleware(SheetsEventsMiddleware):
+	# Archive sheet is similar to events sheet but is missing some columns, e.g. tags, category, poster_moment and marked_for_edit.
+	column_map = {
+		'event_start': 0,
+		'event_end': 1,
+		'description': 2,
+		'state': 3,
+		'notes': 4,
+		'edit_link': 6, # NOTE(review): sheet column index 5 is not mapped here - confirm that is intended
+		'error': 7,
+		'id': 8,
+	}
+
+	def show_edit_url(self, row): # no marked_for_edit column here, so key off start/end being set
+		return row['event_start'] is not None and row['event_end'] is not None