You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
wubloader/thrimshim/thrimshim/main.py

536 lines
19 KiB
Python

import datetime
from functools import wraps
import json
import logging
import re
import argh
import base64
import binascii
import flask
import gevent
import gevent.backdoor
from gevent.pywsgi import WSGIServer
import prometheus_client
import psycopg2
from psycopg2 import sql
import common
from common import database
from common.flask_stats import request_stats, after_request
import google.oauth2.id_token
import google.auth.transport.requests
# `import psycopg2` on its own does not load the `extras` submodule. Import it
# explicitly instead of relying on some other module having imported it first.
import psycopg2.extras

# Register psycopg2's UUID support so uuid values can be passed to and read
# from queries as Python UUID objects.
psycopg2.extras.register_uuid()

app = flask.Flask('thrimshim')
# Run the after_request hook imported from common.flask_stats on every response.
app.after_request(after_request)

MAX_TITLE_LENGTH = 100 # Youtube only allows 100-character titles
MAX_DESCRIPTION_LENGTH = 5000 # Youtube only allows 5000-character descriptions
# First line of the playlist section that update_row appends to descriptions
# and that get_row strips off again.
DESCRIPTION_PLAYLISTS_HEADER = "This video is part of the following playlists:"
def cors(app):
    """Wrap a WSGI application so that every response carries CORS headers.

    Returns a new WSGI callable that delegates to ``app`` and appends a fixed
    set of permissive ``Access-Control-*`` headers to the header list the
    application hands to ``start_response``.
    """
    cors_headers = [
        ("Access-Control-Allow-Credentials", "false"),
        ("Access-Control-Allow-Headers", "*"),
        ("Access-Control-Allow-Methods", "GET,POST,HEAD"),
        ("Access-Control-Allow-Origin", "*"),
        ("Access-Control-Max-Age", "86400"),
    ]

    def middleware(environ, start_response):
        def start_with_cors(status, headers, exc_info=None):
            # The application's own list is extended in place and forwarded.
            headers.extend(cors_headers)
            return start_response(status, headers, exc_info)

        return app(environ, start_with_cors)

    return middleware
def authenticate(f):
    """Decorator that authenticates the request before calling ``f``.

    The request's JSON body must contain a Google ID token under 'token'.
    The token is verified, and the (lower-cased) email it carries must be
    present in the ``editors`` table. On success ``f`` is called with an extra
    ``editor`` keyword argument holding that email.

    If ``app.no_authentication`` is set, all checks are skipped and ``f`` is
    called with ``editor='NOT_AUTH'``.

    On failure the wrapper returns a (message, status) tuple instead of
    calling ``f``: 401 if no token was sent, 403 if the token is invalid,
    carries no email, or belongs to an unknown user.

    Reference: https://developers.google.com/identity/sign-in/web/backend-auth
    """
    @wraps(f)
    def auth_wrapper(*args, **kwargs):
        if app.no_authentication:
            return f(*args, editor='NOT_AUTH', **kwargs)

        try:
            user_token = flask.request.json['token']
        except (KeyError, TypeError):
            # KeyError: no 'token' key. TypeError: body is not a JSON object.
            return 'User token required', 401

        # check whether token is valid
        try:
            # NOTE(review): the audience argument is None, so the token's
            # intended audience is not verified - confirm this is deliberate.
            idinfo = google.oauth2.id_token.verify_oauth2_token(
                user_token, google.auth.transport.requests.Request(), None,
            )
            if idinfo['iss'] not in ['accounts.google.com', 'https://accounts.google.com']:
                raise ValueError('Wrong issuer.')
        except ValueError:
            return 'Invalid token. Access denied.', 403

        # A valid token is not guaranteed to carry an email claim. Reject it
        # cleanly instead of failing with a KeyError (HTTP 500).
        email = idinfo.get('email')
        if not email:
            return 'Token has no email. Access denied.', 403
        email = email.lower()

        # check whether user is in the database
        conn = app.db_manager.get_conn()
        results = database.query(conn, """
SELECT email
FROM editors
WHERE lower(email) = %s""", email)
        row = results.fetchone()
        if row is None:
            return 'Unknown user. Access denied.', 403

        return f(*args, editor=email, **kwargs)

    return auth_wrapper
@app.route('/thrimshim/auth-test', methods=['POST'])
@request_stats
@authenticate
def test(editor=None):
    """Authentication check endpoint: respond with the editor as JSON.

    ``editor`` is filled in by the ``authenticate`` decorator, so a normal
    response means the request passed authentication.
    """
    body = json.dumps(editor)
    return body
# nginx proxying is simpler if anything under /metrics/ is served as well,
# so this route accepts (and ignores) one trailing path segment.
@app.route('/metrics/<trailing>')
@request_stats
def metrics_with_trailing(trailing):
    """Serve the Prometheus metrics, ignoring the trailing path segment."""
    payload = prometheus_client.generate_latest()
    return payload
@app.route('/metrics')
@request_stats
def metrics():
    """Serve the Prometheus metrics as produced by prometheus_client."""
    payload = prometheus_client.generate_latest()
    return payload
@app.route('/thrimshim')
@request_stats
def get_all_rows():
    """Return every row of the events table, ordered by event_start, as a JSON list.

    Each row becomes a JSON object keyed by column name. The id is sent as a
    string. Values JSON cannot represent natively are converted the same way
    get_row converts them: datetimes to ISO 8601 strings, timedeltas to
    seconds, and binary values to base64 text.
    """
    conn = app.db_manager.get_conn()
    results = database.query(conn, """
SELECT *
FROM events
ORDER BY event_start
""")

    def convert(value):
        # Called by json.dumps for any value it cannot serialize itself.
        # Without the bytes/timedelta cases, one row holding such a value
        # (e.g. a thumbnail_image) would make the whole listing fail.
        if isinstance(value, datetime.datetime):
            return value.isoformat()
        if isinstance(value, datetime.timedelta):
            return value.total_seconds()
        if isinstance(value, (memoryview, bytes)):
            return base64.b64encode(bytes(value)).decode()
        raise TypeError(f"Can't convert object of type {value.__class__.__name__} to JSON: {value}")

    rows = []
    for row in results:
        row = row._asdict()
        row['id'] = str(row['id'])
        rows.append(row)

    logging.info('All rows fetched')
    return json.dumps(rows, default=convert)
@app.route('/thrimshim/defaults')
@request_stats
def get_defaults():
    """Return the defaults thrimbletrimmer needs when no specific row is loaded.

    The response is a JSON object built from the values configured on ``app``.
    """
    prefix = app.title_header
    defaults = {
        "video_channel": app.default_channel,
        "bustime_start": app.bustime_start,
        "title_prefix": prefix,
        # The prefix counts towards the title limit.
        "title_max_length": MAX_TITLE_LENGTH - len(prefix),
        "upload_locations": app.upload_locations,
    }
    return json.dumps(defaults)
@app.route('/thrimshim/<uuid:ident>', methods=['GET'])
@request_stats
def get_row(ident):
    """Return the events row with id == ident as a JSON object.

    Besides the row's columns, the response carries the configured defaults
    (title prefix, maximum title length, bustime start, upload locations).
    Missing values are filled in for video_channel, thumbnail_template and
    thumbnail_time. The title prefix and the playlist/footer section that
    update_row appends are stripped again so a read followed by a write is a
    no-op.

    Returns a (message, 404) tuple if no such row exists.
    """
    conn = app.db_manager.get_conn()
    results = database.query(conn, """
SELECT *
FROM events
WHERE id = %s
""", ident)
    row = results.fetchone()
    if row is None:
        return 'Row id = {} not found'.format(ident), 404
    assert row.id == ident

    data = row._asdict()
    data['id'] = str(data['id'])

    header = app.title_header
    if data["video_channel"] is None:
        data["video_channel"] = app.default_channel
    data["title_prefix"] = header
    data["title_max_length"] = MAX_TITLE_LENGTH - len(header)
    data["bustime_start"] = app.bustime_start
    data["upload_locations"] = app.upload_locations

    # Thumbnail defaults: the template depends on the start time, the frame
    # time is the midpoint of the video. Neither is set without a start time.
    shift_templates = [
        "zeta",
        "dawn-guard",
        "alpha-flight",
        "night-watch",
    ]
    start = data['event_start']
    if start is not None:
        if data['thumbnail_template'] is None:
            if data['category'] == "RDP":
                # RDPs get their own template
                template = "rdp"
            else:
                # everything else uses the shift (6-hour slot) it started in
                pst_hour = (start.hour - 8) % 24
                template = shift_templates[pst_hour // 6]
            data['thumbnail_template'] = template
        if data['thumbnail_time'] is None:
            end = data['event_end']
            if end is None:
                # no end time known, fall back to the first frame
                data['thumbnail_time'] = start
            else:
                # halfway between start and end
                data['thumbnail_time'] = start + (end - start) / 2

    # Undo the additions made on write so round-tripping changes nothing.
    title = data["video_title"]
    if header and title is not None and title.startswith(header):
        data["video_title"] = title[len(header):]

    description_playlist_re = re.compile(r"\n\n({}\n(- .* \[https://youtube.com/playlist\?list=[A-Za-z0-9_-]+\]\n)+\n)?{}$".format(
        re.escape(DESCRIPTION_PLAYLISTS_HEADER),
        re.escape(app.description_footer),
    ))
    description = data["video_description"]
    if description is not None:
        found = description_playlist_re.search(description)
        if found:
            data["video_description"] = description[:found.start()]

    logging.info('Row {} fetched'.format(ident))

    def convert(value):
        # json.dumps fallback for the non-JSON types a row may contain
        if isinstance(value, datetime.datetime):
            return value.isoformat()
        if isinstance(value, datetime.timedelta):
            return value.total_seconds()
        if isinstance(value, (memoryview, bytes)):
            return base64.b64encode(bytes(value)).decode()
        raise TypeError(f"Can't convert object of type {value.__class__.__name__} to JSON: {value}")

    return json.dumps(data, default=convert)
@app.route('/thrimshim/<uuid:ident>', methods=['POST'])
@request_stats
@authenticate
def update_row(ident, editor=None):
    """Apply a submitted edit to the events row with id == ident.

    The JSON request body must contain the edit columns, the requested
    'state' ('UNEDITED', 'EDITED' or 'MODIFIED') and the sheet columns as the
    editor last saw them. Unknown keys are dropped. The configured title
    prefix, the matching playlist links and the description footer are added
    to the title/description before validation and storage.

    ``editor`` is supplied by the ``authenticate`` decorator and is recorded
    on the row for non-MODIFIED updates.

    Returns '' on success, otherwise a (message, status) tuple:
    400 for missing or invalid fields, 403 if the row's state does not allow
    the update, 404 if the row does not exist, and 409 if the sheet columns
    changed since editing began (unless 'override_changes' is true).
    """
    new_row = flask.request.json
    override_changes = new_row.get('override_changes', False)
    state_columns = ['state', 'uploader', 'error', 'video_link']
    # These have to be set before a video can be set as 'EDITED'
    non_null_columns = [
        'upload_location', 'video_ranges', 'video_transitions',
        'video_channel', 'video_quality', 'video_title',
        'video_description', 'video_tags', 'thumbnail_mode', 'public'
    ]
    edit_columns = non_null_columns + [
        'allow_holes', 'uploader_whitelist', 'thumbnail_time', 'thumbnail_template', 'thumbnail_image'
    ]
    sheet_columns = [
        'sheet_name', 'event_start', 'event_end',
        'category', 'description', 'notes', 'tags',
    ]
    # These columns may be modified when a video is in state 'DONE',
    # and are a subset of edit_columns.
    modifiable_columns = [
        'video_title', 'video_description', 'video_tags', 'public',
        'thumbnail_mode', 'thumbnail_time', 'thumbnail_template', 'thumbnail_image',
    ]
    assert set(modifiable_columns) - set(edit_columns) == set()

    # Check vital edit columns are in new_row
    wanted = set(non_null_columns + ['state'] + sheet_columns)
    missing = wanted - set(new_row)
    if missing:
        return 'Fields missing in JSON: {}'.format(', '.join(sorted(missing))), 400
    # Get rid of irrelevant columns
    extras = set(new_row) - set(edit_columns + state_columns + sheet_columns)
    for extra in extras:
        del new_row[extra]

    # Check a row with id = ident is in the database
    conn = app.db_manager.get_conn()
    built_query = sql.SQL("""
SELECT id, state, {}
FROM events
WHERE id = %s
""").format(sql.SQL(', ').join(
        sql.Identifier(key) for key in sheet_columns
    ))
    results = database.query(conn, built_query, ident)
    old_row = results.fetchone()
    # The None check must come before _asdict(), otherwise a missing row
    # raises AttributeError instead of producing the 404.
    if old_row is None:
        return 'Row {} not found'.format(ident), 404
    old_row = old_row._asdict()
    assert old_row['id'] == ident

    playlists = database.query(conn, """
SELECT playlist_id, name, tags
FROM playlists
WHERE show_in_description
""")
    # Filter for matching playlists for this video: a playlist matches when
    # all of its tags are among the row's tags (case-insensitive).
    row_tags = {tag.lower() for tag in (old_row['tags'] or [])}
    playlists = [
        playlist for playlist in playlists
        if all(tag.lower() in row_tags for tag in playlist.tags)
    ]

    # Include headers and footers
    new_row['video_title'] = app.title_header + new_row['video_title']
    description_lines = []
    if playlists:
        # NOTE: If you change this format, you need to also change the regex that matches this
        # on the GET handler.
        description_lines.append(DESCRIPTION_PLAYLISTS_HEADER)
        description_lines += [
            "- {} [https://youtube.com/playlist?list={}]".format(playlist.name, playlist.playlist_id)
            for playlist in playlists
        ]
        description_lines.append('') # blank line before footer
    description_lines.append(app.description_footer)
    new_row['video_description'] += "\n\n" + "\n".join(description_lines)

    # Validate youtube requirements on title and description
    if len(new_row['video_title']) > MAX_TITLE_LENGTH:
        return 'Title must be {} characters or less, including prefix'.format(MAX_TITLE_LENGTH), 400
    if len(new_row['video_description']) > MAX_DESCRIPTION_LENGTH:
        return 'Description must be {} characters or less, including footer'.format(MAX_DESCRIPTION_LENGTH), 400
    for char in ['<', '>']:
        if char in new_row['video_title']:
            return 'Title may not contain a {} character'.format(char), 400
        if char in new_row['video_description']:
            return 'Description may not contain a {} character'.format(char), 400

    # Validate and convert video ranges and transitions.
    num_ranges = len(new_row['video_ranges'])
    if num_ranges == 0:
        return 'Ranges must contain at least one range', 400
    if len(new_row['video_transitions']) != num_ranges - 1:
        # The status code is required here; without it this error was sent as a 200.
        return 'There must be exactly {} transitions for {} ranges'.format(
            num_ranges - 1, num_ranges,
        ), 400
    for start, end in new_row['video_ranges']:
        if start > end:
            return 'Range start must be less than end', 400
    # We need these to be tuples not lists for psycopg2 to do the right thing,
    # but since they come in as JSON they are currently lists.
    new_row['video_ranges'] = [tuple(video_range) for video_range in new_row['video_ranges']]
    new_row['video_transitions'] = [
        None if transition is None else tuple(transition)
        for transition in new_row['video_transitions']
    ]

    # Convert binary fields from base64 and do basic validation of contents
    if new_row.get('thumbnail_image') is not None:
        if new_row['thumbnail_mode'] != 'CUSTOM':
            return 'Can only upload custom image when thumbnail_mode = "CUSTOM"', 400
        try:
            new_row['thumbnail_image'] = base64.b64decode(new_row['thumbnail_image'])
        except binascii.Error:
            return 'thumbnail_image must be valid base64', 400
        # check for PNG file header
        if not new_row['thumbnail_image'].startswith(b'\x89PNG\r\n\x1a\n'):
            return 'thumbnail_image must be a PNG', 400

    if new_row['state'] == 'MODIFIED':
        if old_row['state'] not in ['DONE', 'MODIFIED']:
            return 'Video is in state {} and cannot be modified'.format(old_row['state']), 403
    elif old_row['state'] not in ['UNEDITED', 'EDITED', 'CLAIMED']:
        return 'Video already published', 403

    # check whether row has been changed in the sheet since editing has begun
    def normalize(value):
        # Compare case-insensitively, ignoring surrounding whitespace and list order.
        if isinstance(value, list):
            return sorted(map(normalize, value))
        if value is None:
            return None
        return value.lower().strip()

    changes = []
    for column in sheet_columns:
        if isinstance(old_row[column], datetime.datetime):
            old_row[column] = old_row[column].isoformat()
        if normalize(new_row[column]) != normalize(old_row[column]):
            changes.append('{}: {} => {}\n'.format(column, new_row[column], old_row[column]))
    if changes and not override_changes:
        return 'Sheet columns have changed since editing has begun. Please review changes\n' + ''.join(changes), 409

    if new_row['state'] == 'MODIFIED':
        # Modifying published rows is more limited, we ignore all other fields.
        # Build a fresh list here: the earlier `missing` is a set, which has no append().
        missing = [
            column for column in modifiable_columns
            if column in non_null_columns and new_row.get(column) is None
        ]
        if missing:
            return 'Fields {} must be non-null for modified video'.format(', '.join(missing)), 400
        build_query = sql.SQL("""
UPDATE events
SET last_modified = NOW(), error = NULL, state = 'MODIFIED', {}
WHERE id = %(id)s AND state IN ('DONE', 'MODIFIED')
""").format(sql.SQL(", ").join(
            sql.SQL("{} = {}").format(
                sql.Identifier(column), database.get_column_placeholder(column),
            ) for column in modifiable_columns if column in new_row
        ))
        result = database.query(conn, build_query, id=ident, **new_row)
        if result.rowcount != 1:
            return 'Video changed state while we were updating - maybe it was reset?', 403
    else:
        # handle state columns
        if new_row['state'] == 'EDITED':
            missing = [column for column in non_null_columns if new_row[column] is None]
            if missing:
                return 'Fields {} must be non-null for video to be cut'.format(', '.join(missing)), 400
            # The prefix was already added above, so a title no longer than
            # the prefix means the submitted title was empty.
            if len(new_row.get('video_title', '')) <= len(app.title_header):
                return 'Video title must not be blank', 400
        elif new_row['state'] != 'UNEDITED':
            return 'Invalid state {}'.format(new_row['state']), 400
        new_row['uploader'] = None
        new_row['error'] = None
        new_row['editor'] = editor
        new_row['edit_time'] = datetime.datetime.utcnow()

        # actually update database
        build_query = sql.SQL("""
UPDATE events
SET {}
WHERE id = %(id)s
AND state IN ('UNEDITED', 'EDITED', 'CLAIMED')"""
        ).format(sql.SQL(", ").join(
            sql.SQL("{} = {}").format(
                sql.Identifier(column), database.get_column_placeholder(column),
            ) for column in new_row.keys() if column not in sheet_columns
        ))
        result = database.query(conn, build_query, id=ident, **new_row)
        if result.rowcount != 1:
            return 'Video likely already published', 403

    logging.info('Row {} updated to state {}'.format(ident, new_row['state']))
    return ''
@app.route('/thrimshim/manual-link/<uuid:ident>', methods=['POST'])
@request_stats
@authenticate
def manual_link(ident, editor=None):
    """Manually set the video_link of a row that is in state 'UNEDITED'.

    The JSON request body must contain 'link' and may contain
    'upload_location', which must be 'manual' (the default) or
    'youtube-manual'. For 'youtube-manual' the link must be a youtube.com or
    youtu.be video URL, from which the video id is extracted.

    On success the row is moved to state 'DONE' with thumbnail_mode 'NONE'
    and '' is returned. Otherwise a (message, status) tuple is returned:
    400 for invalid input, 404 if the row does not exist, 403 if the row is
    not (or is no longer) in state 'UNEDITED'.
    """
    payload = flask.request.json
    link = payload.get('link')
    if not isinstance(link, str):
        # Previously a missing key surfaced as a KeyError (HTTP 500).
        return 'A "link" string is required', 400
    upload_location = payload.get('upload_location', 'manual')

    if upload_location == 'youtube-manual':
        # Dots are escaped so only the literal youtube.com host matches.
        YOUTUBE_URL_RE = r'^https?://(?:youtu\.be/|youtube\.com/watch\?v=)([a-zA-Z0-9_-]{11})$'
        match = re.match(YOUTUBE_URL_RE, link)
        if not match:
            return 'Link does not appear to be a youtube.com or youtu.be video link. Try removing any extra query params (after the video id).', 400
        video_id, = match.groups()
    elif upload_location == 'manual':
        video_id = None
    else:
        return 'Upload location must be "manual" or "youtube-manual"', 400

    conn = app.db_manager.get_conn()
    results = database.query(conn, """
SELECT id, state
FROM events
WHERE id = %s""", ident)
    old_row = results.fetchone()
    if old_row is None:
        return 'Row {} not found'.format(ident), 404
    if old_row.state != 'UNEDITED':
        return 'Invalid state {} for manual video link'.format(old_row.state), 403
    now = datetime.datetime.utcnow()
    # note we force thumbnail mode of manual uploads to always be NONE,
    # since they might not be a video we actually control at all, or might not even be on youtube.
    results = database.query(conn, """
UPDATE events
SET state='DONE', upload_location = %s, video_link = %s, video_id = %s,
editor = %s, edit_time = %s, upload_time = %s, thumbnail_mode = 'NONE'
WHERE id = %s AND state = 'UNEDITED'
""", upload_location, link, video_id, editor, now, now, ident)
    # The row may have left 'UNEDITED' between the SELECT and the UPDATE, in
    # which case nothing was written and we must not report success.
    if results.rowcount != 1:
        return 'Row {} changed state while we were updating, video link not set'.format(ident), 403
    logging.info("Row {} video_link set to {}".format(ident, link))
    return ''
@app.route('/thrimshim/reset/<uuid:ident>', methods=['POST'])
@request_stats
@authenticate
def reset_row(ident, editor=None):
    """Put a row back into state 'UNEDITED', clearing its state and video_link columns.

    With the query parameter force=true (case-insensitive) the reset happens
    whatever the current state is. Without it, the row is only reset while
    no video can have been uploaded yet, i.e. while its state is UNEDITED,
    EDITED or CLAIMED.

    Returns '' on success, or a (message, 404) tuple if no row was updated.
    """
    force_param = flask.request.args.get('force', '')
    if force_param.lower() == "true":
        state_filter = ""
    else:
        state_filter = "AND state IN ('UNEDITED', 'EDITED', 'CLAIMED')"

    # Only one of the two constant strings above is ever formatted in.
    query = """
UPDATE events
SET state='UNEDITED', error = NULL, video_id = NULL, video_link = NULL,
uploader = NULL, editor = NULL, edit_time = NULL, upload_time = NULL,
last_modified = NULL
WHERE id = %s {}
""".format(state_filter)

    conn = app.db_manager.get_conn()
    results = database.query(conn, query, ident)
    if results.rowcount == 1:
        logging.info("Row {} reset to 'UNEDITED'".format(ident))
        return ''
    return 'Row id = {} not found or not in cancellable state'.format(ident), 404
# Entry point: stores the command-line configuration on the flask app, then
# serves it (wrapped in the CORS middleware) until shutdown.
# Deliberately documented with comments rather than a docstring.
@argh.arg('--host', help='Address or socket server will listen to. Default is 0.0.0.0 (everything on the local machine).')
@argh.arg('--port', help='Port server will listen on. Default is 8004.')
@argh.arg('connection-string', help='Postgres connection string, which is either a space-separated list of key=value pairs, or a URI like: postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE')
@argh.arg('default-channel', help='The default video_channel sent to the editor and assumed if not given on write')
@argh.arg('bustime-start', help='The start time in UTC for the event, for UTC-Bustime conversion')
@argh.arg('--backdoor-port', help='Port for gevent.backdoor access. By default disabled.')
@argh.arg('--no-authentication', help='Bypass authentication (act as though all calls are authenticated)')
@argh.arg('--title-header', help='A header to prefix all titles with, seperated from the submitted title by " - "')
@argh.arg('--description-footer', help='A footer to suffix all descriptions with, seperated from the submitted description by a blank line.')
@argh.arg('--upload-locations', help='A comma-seperated list of valid upload locations, to pass to thrimbletrimmer. The first is the default. Note this is NOT validated on write.')
def main(
    connection_string, default_channel, bustime_start, host='0.0.0.0', port=8004, backdoor_port=0,
    no_authentication=False, title_header=None, description_footer=None, upload_locations='',
):
    server = WSGIServer((host, port), cors(app))

    # The request handlers read their configuration from attributes on app.
    app.no_authentication = no_authentication
    app.default_channel = default_channel
    app.bustime_start = bustime_start

    # A configured header is stored together with its " - " separator.
    if title_header is None:
        app.title_header = ""
    else:
        app.title_header = "{} - ".format(title_header)

    if description_footer is None:
        app.description_footer = ""
    else:
        app.description_footer = description_footer

    # An empty option means "no locations", not a list with one empty name.
    if upload_locations:
        app.upload_locations = upload_locations.split(',')
    else:
        app.upload_locations = []

    app.db_manager = database.DBManager(dsn=connection_string)

    common.PromLogCountsHandler.install()
    common.install_stacksampler()

    # A backdoor port of 0 leaves the backdoor disabled.
    if backdoor_port:
        gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start()

    if app.no_authentication:
        logging.warning('Not authenticating POST requests')

    common.serve_with_graceful_shutdown(server)