mirror of https://github.com/ekimekim/wubloader
commit 499e486b0b
common/googleapis.py
@@ -0,0 +1,62 @@
import logging
import time

import gevent
import requests


class GoogleAPIClient(object):
	"""Manages access to google apis and maintains an active access token.
	Make calls using client.request(), which is a wrapper for requests.request().
	"""

	ACCESS_TOKEN_ERROR_RETRY_INTERVAL = 10
	# Refresh token 10min before it expires (it normally lasts an hour)
	ACCESS_TOKEN_REFRESH_TIME_BEFORE_EXPIRY = 600

	def __init__(self, client_id, client_secret, refresh_token):
		self.logger = logging.getLogger(type(self).__name__)
		self.client_id = client_id
		self.client_secret = client_secret
		self.refresh_token = refresh_token

		self._first_get_access_token = gevent.spawn(self.get_access_token)

	@property
	def access_token(self):
		"""Blocks if access token unavailable yet"""
		self._first_get_access_token.join()
		return self._access_token

	def get_access_token(self):
		"""Authenticates against google's API and retrieves a token we will use in
		subsequent requests.
		This function gets called automatically when needed, there should be no need to call it
		yourself."""
		while True:
			try:
				start_time = time.time()
				resp = requests.post('https://www.googleapis.com/oauth2/v4/token', data={
					'client_id': self.client_id,
					'client_secret': self.client_secret,
					'refresh_token': self.refresh_token,
					'grant_type': 'refresh_token',
				})
				resp.raise_for_status()
				data = resp.json()
				self._access_token = data['access_token']
				# Work out how long the token actually has left, accounting for the time
				# spent making the request, then schedule a refresh before it expires.
				expires_in = (start_time + data['expires_in']) - time.time()
				if expires_in < self.ACCESS_TOKEN_REFRESH_TIME_BEFORE_EXPIRY:
					self.logger.warning("Access token expires in {}s, less than normal leeway time of {}s".format(
						expires_in, self.ACCESS_TOKEN_REFRESH_TIME_BEFORE_EXPIRY,
					))
				gevent.spawn_later(expires_in - self.ACCESS_TOKEN_REFRESH_TIME_BEFORE_EXPIRY, self.get_access_token)
			except Exception:
				self.logger.exception("Failed to fetch access token, retrying")
				# There's no stop event at this level, so just sleep before retrying.
				gevent.sleep(self.ACCESS_TOKEN_ERROR_RETRY_INTERVAL)
			else:
				break

	def request(self, method, url, headers={}, **kwargs):
		# merge in auth header
		headers = dict(headers, Authorization='Bearer {}'.format(self.access_token))
		return requests.request(method, url, headers=headers, **kwargs)
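A minimal usage sketch, for context: the credential values below are hypothetical placeholders (real ones come out of Google's OAuth2 consent flow), and the spreadsheet id is invented. Any Google API that accepts Bearer-token auth is called the same way:

	# Hypothetical credentials, shown only to illustrate the call pattern.
	client = GoogleAPIClient(
		client_id='my-client-id.apps.googleusercontent.com',
		client_secret='my-client-secret',
		refresh_token='my-refresh-token',
	)
	# request() mirrors requests.request(); the Authorization header is merged in
	# automatically, blocking until the first access token fetch completes.
	resp = client.request('GET',
		'https://sheets.googleapis.com/v4/spreadsheets/{}/values/{}'.format('MY_SPREADSHEET_ID', 'Sheet1'))
	resp.raise_for_status()
	print resp.json()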
sheetsync/Dockerfile
@@ -0,0 +1,16 @@
FROM alpine:3.7
# dependencies needed for compiling c extensions
# also busybox-extras for telnet for easier use of backdoor
# and postgresql-dev as a dependency of psycopg2.
# Add postgresql-client for easier debugging of DB issues.
RUN apk --update add py2-pip gcc python-dev musl-dev busybox-extras postgresql-dev postgresql-client

# Install common lib first as it changes less
COPY common /tmp/common
RUN pip install /tmp/common && rm -r /tmp/common

# Install actual application
COPY sheetsync /tmp/sheetsync
RUN pip install /tmp/sheetsync && rm -r /tmp/sheetsync

ENTRYPOINT ["python2", "-m", "sheetsync"]
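Both COPY directives assume the build context is the repository root, where common and sheetsync are siblings. A build-and-run sketch (the image tag is illustrative):

	docker build -f sheetsync/Dockerfile -t wubloader-sheetsync .
	docker run --rm wubloader-sheetsync --help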
sheetsync/setup.py
@@ -0,0 +1,17 @@
from setuptools import setup, find_packages

setup(
	name = "wubloader-sheetsync",
	version = "0.0.0",
	packages = find_packages(),
	install_requires = [
		"argh",
		"gevent",
		"prometheus-client",
		"psycogreen",
		"psycopg2",
		"python-dateutil",
		"requests",
		"wubloader-common",
	],
)
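wubloader-common is not a published package; as the Dockerfile above shows, that dependency is satisfied by pip-installing the repository's local common directory first. The equivalent local setup, run from the repository root, would be roughly:

	pip install ./common
	pip install ./sheetsync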
sheetsync/__main__.py
@@ -0,0 +1,16 @@
import gevent.monkey
gevent.monkey.patch_all()

import logging
import os

import argh

from sheetsync.main import main

LOG_FORMAT = "[%(asctime)s] %(levelname)8s %(name)s(%(module)s:%(lineno)d): %(message)s"

level = os.environ.get('WUBLOADER_LOG_LEVEL', 'INFO').upper()
logging.basicConfig(level=level, format=LOG_FORMAT)
argh.dispatch_command(main)
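For illustration, a line produced by this configuration would look roughly like the following (timestamp and line number invented; the logger name is "root" because sheetsync logs via the module-level logging functions):

	[2019-11-09 01:23:45,678]     INFO root(main:123): Successful sync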
sheetsync/main.py
@@ -0,0 +1,284 @@
import json
import logging
import signal
import uuid

import argh
import gevent.backdoor
import gevent.event
import prometheus_client as prom
from psycopg2 import sql
from psycopg2.extras import register_uuid

import common
import common.dateutil
from common.database import DBManager, query

from .sheets import Sheets


class SheetSync(object):

	# Time between syncs
	RETRY_INTERVAL = 5
	# Time to wait after getting an error
	ERROR_RETRY_INTERVAL = 10

	def __init__(self, stop, dbmanager, sheets, sheet_id, worksheets, edit_url, bustime_start, allocate_ids=False):
		self.stop = stop
		self.conn = dbmanager.get_conn()
		self.sheets = sheets
		self.sheet_id = sheet_id
		self.worksheets = worksheets
		self.edit_url = edit_url
		self.bustime_start = bustime_start
		self.allocate_ids = allocate_ids
		# Maps DB column names (or general identifier, for non-DB columns) to sheet column indexes.
		# Hard-coded for now, future work: determine this from column headers in sheet.
		# Note that index 9 is deliberately absent: that sheet column is not synced.
		self.column_map = {
			'event_start': 0,
			'event_end': 1,
			'category': 2,
			'description': 3,
			'submitter_winner': 4,
			'poster_moment': 5,
			'image_links': 6,
			'marked_for_edit': 7,
			'notes': 8,
			'video_link': 10,
			'state': 11,
			'edit_link': 12,
			'error': 13,
			'id': 14,
		}
		# Maps column names to a function that parses that column's value.
		# Functions take a single arg (the value to parse) and ValueError is
		# interpreted as None.
		# Columns missing from this map default to simply using the string value.
		self.column_parsers = {
			'event_start': self.parse_bustime,
			'event_end': self.parse_bustime,
			'poster_moment': lambda v: v == u'[\u2713]', # check mark
			'image_links': lambda v: [link.strip() for link in v.split()] if v.strip() else [],
			'id': lambda v: uuid.UUID(v) if v.strip() else None,
		}
		# Columns copied from the sheet into the DB
		self.input_columns = [
			'event_start',
			'event_end',
			'category',
			'description',
			'submitter_winner',
			'poster_moment',
			'image_links',
			'notes',
		]
		# Columns copied from the DB back out to the sheet
		self.output_columns = [
			'video_link',
			'state',
			'error',
		]

	def parse_bustime(self, value):
		"""Convert from HH:MM or HH:MM:SS format to datetime"""
		bustime = common.parse_bustime(value)
		return common.bustime_to_dt(self.bustime_start, bustime)

	def wait(self, interval):
		self.stop.wait(common.jitter(interval))

	def run(self):
		while not self.stop.is_set():
			try:
				# Since the full dataset is small, the cost of round tripping to the database to check
				# each row is more expensive than the cost of just grabbing the entire table
				# and comparing locally.
				events = self.get_events()
				for worksheet in self.worksheets:
					rows = self.sheets.get_rows(self.sheet_id, worksheet)
					for row_index, row in enumerate(rows):
						# Skip the first row (ie. the column titles).
						# Need to do it inside the loop and not eg. use rows[1:],
						# because then row_index won't be correct.
						if row_index == 0:
							continue
						row = self.parse_row(row)
						self.sync_row(worksheet, row_index, row, events.get(row['id']))
			except Exception:
				logging.exception("Failed to sync")
				self.wait(self.ERROR_RETRY_INTERVAL)
			else:
				logging.info("Successful sync")
				self.wait(self.RETRY_INTERVAL)

	def get_events(self):
		"""Return the entire events table as a map {id: event namedtuple}"""
		result = query(self.conn, "SELECT * FROM events")
		by_id = {}
		for row in result.fetchall():
			by_id[row.id] = row
		return by_id

	def parse_row(self, row):
		"""Take a row as a sequence of columns, and return a dict {column: value}"""
		row_dict = {}
		for column, index in self.column_map.items():
			if index >= len(row):
				# Sheets omits trailing columns if they're all empty, so substitute empty string
				value = ''
			else:
				value = row[index]
			if column in self.column_parsers:
				try:
					value = self.column_parsers[column](value)
				except ValueError:
					value = None
			row_dict[column] = value
		return row_dict

	def sync_row(self, worksheet, row_index, row, event):
		"""Take a row dict and an Event from the database (or None if id not found)
		and take whatever action is required to sync them, ie. writing to the database or sheet."""

		if event is None:
			# No event currently in DB. If any input field is non-empty, create it;
			# otherwise ignore the row.
			if not any(row[col] for col in self.input_columns):
				return

			# Only generate an id when one isn't already there.
			# Originally we would allocate ids on first sync, but this led to rate limiting issues.
			if row['id'] is None:
				if self.allocate_ids:
					row['id'] = uuid.uuid4()
					logging.info("Allocating id for row {!r}:{} = {}".format(worksheet, row_index, row['id']))
					self.sheets.write_value(
						self.sheet_id, worksheet,
						row_index, self.column_map['id'],
						str(row['id']),
					)
				else:
					logging.warning("Row {!r}:{} has no valid id, skipping".format(worksheet, row_index))
					return

			logging.info("Inserting new event {}".format(row['id']))
			# An insertion conflict just means that another sheet sync beat us to the insert.
			# We can ignore it.
			insert_cols = ['id'] + self.input_columns
			built_query = sql.SQL("""
				INSERT INTO events ({})
				VALUES ({})
				ON CONFLICT DO NOTHING
			""").format(
				sql.SQL(", ").join(sql.Identifier(col) for col in insert_cols),
				sql.SQL(", ").join(sql.Placeholder(col) for col in insert_cols),
			)
			query(self.conn, built_query, **row)
			return

		# Update database with any changed inputs
		changed = [col for col in self.input_columns if row[col] != getattr(event, col)]
		if changed:
			logging.info("Updating event {} with new value(s) for {}".format(
				row['id'], ', '.join(changed)
			))
			built_query = sql.SQL("""
				UPDATE events
				SET {}
				WHERE id = %(id)s
			""").format(sql.SQL(", ").join(
				sql.SQL("{} = {}").format(
					sql.Identifier(col), sql.Placeholder(col)
				) for col in changed
			))
			query(self.conn, built_query, **row)

		# Update sheet with any changed outputs
		format_output = lambda v: '' if v is None else v # cast nulls to empty string
		changed = [col for col in self.output_columns if row[col] != format_output(getattr(event, col))]
		if changed:
			logging.info("Updating sheet row {} with new value(s) for {}".format(
				row['id'], ', '.join(changed)
			))
			for col in changed:
				self.sheets.write_value(
					self.sheet_id, worksheet,
					row_index, self.column_map[col],
					format_output(getattr(event, col)),
				)

		# Set the edit link if the row is marked for editing, and clear it if it shouldn't be set.
		# Waiting for the mark prevents accidents / clicking the wrong row, and writing the link
		# provides feedback that sheet sync is still working.
		edit_link = self.edit_url.format(row['id']) if row['marked_for_edit'] == '[+] Marked' else ''
		if row['edit_link'] != edit_link:
			logging.info("Updating sheet row {} with edit link {}".format(row['id'], edit_link))
			self.sheets.write_value(
				self.sheet_id, worksheet,
				row_index, self.column_map['edit_link'],
				edit_link,
			)


@argh.arg('dbconnect', help=
	"dbconnect should be a postgres connection string, which is either a space-separated "
	"list of key=value pairs, or a URI like:\n"
	"\tpostgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE"
)
@argh.arg('sheets-creds-file', help=
	"sheets_creds_file should be a json file containing keys "
	"'client_id', 'client_secret' and 'refresh_token'."
)
@argh.arg('edit-url', help=
	'edit_url should be a format string for edit links, with {} as a placeholder for id. '
	'eg. "https://myeditor.example.com/edit/{}" will produce edit urls like '
	'"https://myeditor.example.com/edit/da6cf4df-4871-4a9a-a660-0b1e1a6a9c10".'
)
@argh.arg('bustime-start', type=common.dateutil.parse, help=
	"bustime_start is the timestamp which is bustime 00:00."
)
@argh.arg('worksheet-names', nargs='+', help=
	"The names of the individual worksheets within the sheet to operate on."
)
@argh.arg('--allocate-ids', help=
	"--allocate-ids means that this instance will assign an id to rows that lack one. "
	"Only one sheet sync should have --allocate-ids on for a given sheet at once!"
)
def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksheet_names, metrics_port=8005, backdoor_port=0, allocate_ids=False):
	"""
	Sheet sync constantly scans a Google Sheets sheet and a database, copying inputs from the sheet
	to the DB and outputs from the DB to the sheet.

	With the exception of id allocation, all operations are idempotent and multiple sheet syncs
	may be run for redundancy.
	"""
	common.PromLogCountsHandler.install()
	common.install_stacksampler()
	prom.start_http_server(metrics_port)

	register_uuid()

	if backdoor_port:
		gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start()

	stop = gevent.event.Event()
	gevent.signal(signal.SIGTERM, stop.set) # shut down on sigterm

	logging.info("Starting up")

	dbmanager = DBManager(dsn=dbconnect)
	sheets_creds = json.load(open(sheets_creds_file))

	sheets = Sheets(
		client_id=sheets_creds['client_id'],
		client_secret=sheets_creds['client_secret'],
		refresh_token=sheets_creds['refresh_token'],
	)

	SheetSync(stop, dbmanager, sheets, sheet_id, worksheet_names, edit_url, bustime_start, allocate_ids).run()

	logging.info("Gracefully stopped")
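Tying the entry point and arguments together, an illustrative invocation; every value below is a placeholder, and the positional order follows the main() signature (dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, then worksheet names). Per the help text above, only one running instance should be given --allocate-ids for a given sheet:

	export WUBLOADER_LOG_LEVEL=DEBUG
	python2 -m sheetsync \
		--allocate-ids \
		'postgresql://vst:PASSWORD@localhost/wubloader' \
		sheets_creds.json \
		'https://myeditor.example.com/edit/{}' \
		'2019-11-09T00:00:00' \
		MY_SPREADSHEET_ID \
		'Day 1' 'Day 2'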
sheetsync/sheets.py
@@ -0,0 +1,64 @@
import logging

from common.googleapis import GoogleAPIClient


class Sheets(object):
	"""Manages Google Sheets API operations"""

	def __init__(self, client_id, client_secret, refresh_token):
		self.logger = logging.getLogger(type(self).__name__)
		self.client = GoogleAPIClient(client_id, client_secret, refresh_token)

	def get_rows(self, spreadsheet_id, sheet_name, range=None):
		"""Return a list of rows, where each row is a list of the values of each column.
		Range optionally restricts returned rows, and uses A1 format, eg. "A1:B5".
		"""
		if range:
			range = "'{}'!{}".format(sheet_name, range)
		else:
			range = "'{}'".format(sheet_name)
		resp = self.client.request('GET',
			'https://sheets.googleapis.com/v4/spreadsheets/{}/values/{}'.format(
				spreadsheet_id, range,
			),
		)
		resp.raise_for_status()
		data = resp.json()
		# 'values' is omitted from the response entirely if the range is empty
		return data.get('values', [])

	def write_value(self, spreadsheet_id, sheet_name, row, column, value):
		"""Write value to the row and column (0-based) given."""
		range = "'{sheet}'!{col}{row}:{col}{row}".format(
			sheet = sheet_name,
			row = row + 1, # 1-indexed rows in range syntax
			col = self.index_to_column(column),
		)
		resp = self.client.request('PUT',
			'https://sheets.googleapis.com/v4/spreadsheets/{}/values/{}'.format(
				spreadsheet_id, range,
			),
			params={
				"valueInputOption": "RAW", # store the value as-is, don't parse it like user input
			},
			json={
				"range": range,
				"values": [[value]],
			},
		)
		resp.raise_for_status()

	def index_to_column(self, index):
		"""For a given 0-based column index, convert to a column description, eg. 0 -> A, 1 -> B, 26 -> AA."""
		# Column letters are bijective base-26: there is no zero digit, so plain base-26
		# doesn't work. Treating the index as 1-based and subtracting 1 before each divmod
		# keeps every digit in the range 0-25, which maps directly onto A-Z.
		digits = []
		index += 1 # work 1-based, ie. A = 1, Z = 26, AA = 27
		while index:
			index, digit = divmod(index - 1, 26)
			digits.append(digit)
		# We now have the digits, but they're backwards.
		digits = digits[::-1]
		# Now convert the digits to letters
		digits = [chr(ord('A') + digit) for digit in digits]
		# Finally, convert to string
		return ''.join(digits)
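A worked check of the conversion, assuming s is a Sheets instance. A naive base-26 reading (without the 1-based adjustment) would return '' for index 0 and 'BA' for 26, which is the bug the adjustment above avoids:

	assert s.index_to_column(0) == 'A'
	assert s.index_to_column(25) == 'Z'
	assert s.index_to_column(26) == 'AA' # 1-based 27: divmod(26, 26) = (1, 0) -> 'A', 'A'
	assert s.index_to_column(51) == 'AZ'
	# write_value() then addresses single cells, eg. row 1, column 14 -> "'Day 1'!O2:O2"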