From eeffeeed1026a267c8980d510ad4343b2d9e83b2 Mon Sep 17 00:00:00 2001
From: Mike Lang <mikelang3000@gmail.com>
Date: Fri, 16 Aug 2024 04:00:05 +1000
Subject: [PATCH] sheetsync: Deal with reverse syncing properly when not all
 events are in the list of worksheets

This is important because archive events should not be reversed.
We only want to create new rows when the row's intended worksheet is in our list of worksheets we sync.
---
 sheetsync/sheetsync/main.py       | 20 +++++++++-----------
 sheetsync/sheetsync/middleware.py | 14 +++++++++++---
 sheetsync/sheetsync/sheets.py     | 11 +++++++++--
 sheetsync/sheetsync/streamlog.py  |  6 +++---
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/sheetsync/sheetsync/main.py b/sheetsync/sheetsync/main.py
index 0d4b889..ed5a80d 100644
--- a/sheetsync/sheetsync/main.py
+++ b/sheetsync/sheetsync/main.py
@@ -112,7 +112,7 @@ class SheetSync(object):
 				db_rows = self.get_db_rows()
 				seen = set()
 
-				is_full, sheet_rows = self.middleware.get_rows()
+				worksheets, sheet_rows = self.middleware.get_rows()
 				for row in sheet_rows:
 					if row['id'] in seen:
 						self.logger.error("Duplicate id {}, skipping".format(row['id']))
@@ -120,16 +120,14 @@ class SheetSync(object):
 					seen.add(row['id'])
 					self.sync_row(row, db_rows.get(row['id']))
 
-				if is_full:
-					# Find rows that were not in the sheet.
-					# Only do this if we did a full sync, otherwise things might be missing
-					# simply because they're in a worksheet we didn't sync.
-					missing = [
-						r for id, r in db_rows.items()
-						if id not in seen
-					]
-					for db_row in missing:
-						self.sync_row(None, db_row)
+				# Find rows that were not in the sheet, that were expected to be in that sheet.
+				missing = [
+					r for id, r in db_rows.items()
+					if id not in seen
+					and self.middleware.row_was_expected(r, worksheets)
+				]
+				for db_row in missing:
+					self.sync_row(None, db_row)
 
 			except Exception as e:
 				# for HTTPErrors, http response body includes the more detailed error
diff --git a/sheetsync/sheetsync/middleware.py b/sheetsync/sheetsync/middleware.py
index bc65303..4797841 100644
--- a/sheetsync/sheetsync/middleware.py
+++ b/sheetsync/sheetsync/middleware.py
@@ -13,12 +13,20 @@ class Middleware:
 				is still required.
 			_parse_errors: A list of error messages encountered when parsing, to be surfaced to the
 				user if possible.
-		In addition to the list of dicts, should return an "is_full" boolean which is True
-		if all rows were fetched or False if only some subset was fetched (eg. for quota management reasons).
-		Returns (is_full, rows).
+		In addition to the list of dicts, should return a list of worksheets fetched from,
+		which is then passed to row_was_expected().
+		Returns (worksheets, rows).
 		"""
 		raise NotImplementedError
 
+	def row_was_expected(self, db_row, worksheets):
+		"""Given a database row and list of worksheets from get_rows(), return whether
+		the given row should have been present in the returned rows, ie. if we expected
+		to find it on one of those worksheets."""
+		# Default to the common case, which is that we always return all data
+		# so the row should always be expected.
+		return True
+
 	def write_value(self, row, key, value):
 		"""Write key=value to the given row. Takes the full row object so any identifying info
 		can be read from it as needed."""
diff --git a/sheetsync/sheetsync/sheets.py b/sheetsync/sheetsync/sheets.py
index 4291b73..ef8733e 100644
--- a/sheetsync/sheetsync/sheets.py
+++ b/sheetsync/sheetsync/sheets.py
@@ -103,8 +103,7 @@ class SheetsMiddleware(Middleware):
 					self.write_id(row)
 
 				all_rows.append(row)
-		is_full = sorted(worksheets) == list(self.worksheets.keys())
-		return is_full, all_rows
+		return worksheets, all_rows
 
 	def row_is_non_empty(self, row):
 		"""Returns True if row is considered to be non-empty and should have an id assigned."""
@@ -218,6 +217,11 @@ class SheetsPlaylistsMiddleware(SheetsMiddleware):
 		"show_in_description": ENCODE_CHECKMARK,
 	}
 
+	def row_was_expected(self, db_row, worksheets):
+		# Database does not record a worksheet for playlists, we assume there's only one
+		# sheet and so it should always be there.
+		return True
+
 	def row_is_non_empty(self, row):
 		return row["tags"] is not None
 
@@ -284,6 +288,9 @@ class SheetsEventsMiddleware(SheetsMiddleware):
 		bustime = common.dt_to_bustime(self.bustime_start, value)
 		return common.format_bustime(bustime, round="minute")
 
+	def row_was_expected(self, db_row, worksheets):
+		return db_row.sheet_name in worksheets
+
 	def row_is_non_empty(self, row):
 		return any(row[col] for col in ["event_start", "description"])
 
diff --git a/sheetsync/sheetsync/streamlog.py b/sheetsync/sheetsync/streamlog.py
index 9db2eb7..928a8f6 100644
--- a/sheetsync/sheetsync/streamlog.py
+++ b/sheetsync/sheetsync/streamlog.py
@@ -71,7 +71,7 @@ class StreamLogPlaylistsMiddleware(Middleware):
 				"first_event_id": None, # TODO missing in StreamLog
 				"last_event_id": None, # TODO missing in StreamLog
 			})
-		return True, rows
+		return None, rows
 
 	# writing intentionally not implemented
 
@@ -124,8 +124,8 @@ class StreamLogEventsMiddleware(Middleware):
 			# Malformed rows can be skipped, represented as a None result
 			if row is not None:
 				all_rows.append(row)
-		# There's no worksheet concept here so we always return a full sync.
-		return True, all_rows
+		# There's no worksheet concept here so just return None for worksheets.
+		return None, all_rows
 
 	def parse_row(self, row):
 		output = {}