From ec1bbad7de65cc3d123d16b9c89429cfc850e883 Mon Sep 17 00:00:00 2001
From: HeNine <>
Date: Thu, 16 Sep 2021 16:09:56 +0200
Subject: [PATCH] API done

---
Review notes (text in this section is ignored by git am):
 - fetch_lines() names the 'english' text search configuration so that the
   query can use buscribe_transcriptions_idx.
 - convert_vtt_timedelta() clamps negative offsets to 00:00:00.000.
 - --port help now states the real default (8005); exit() -> sys.exit().

 .../buscribeapi/{__init.__.py => __init__.py} |  0
 buscribe-api/buscribeapi/__main__.py          | 12 +++
 buscribe-api/buscribeapi/buscribeapi.py       | 98 +++++++++++++++----
 buscribe-api/buscribeapi/main.py              | 61 ++++++++++++++
 buscribe-api/setup.py                         |  1 +
 buscribe-api/templates/busub.jinja            |  2 +
 buscribe-api/templates/busubs.jinja           |  5 ++
 buscribe_data.sql                             |  6 +-
 8 files changed, 170 insertions(+), 15 deletions(-)
 rename buscribe-api/buscribeapi/{__init.__.py => __init__.py} (100%)
 create mode 100644 buscribe-api/templates/busub.jinja
 create mode 100644 buscribe-api/templates/busubs.jinja

diff --git a/buscribe-api/buscribeapi/__init.__.py b/buscribe-api/buscribeapi/__init__.py
similarity index 100%
rename from buscribe-api/buscribeapi/__init.__.py
rename to buscribe-api/buscribeapi/__init__.py
diff --git a/buscribe-api/buscribeapi/__main__.py b/buscribe-api/buscribeapi/__main__.py
index e69de29..81502b2 100644
--- a/buscribe-api/buscribeapi/__main__.py
+++ b/buscribe-api/buscribeapi/__main__.py
@@ -0,0 +1,12 @@
+import logging
+import os
+
+import argh
+
+from buscribeapi.main import main
+
+LOG_FORMAT = "[%(asctime)s] %(levelname)8s %(name)s(%(module)s:%(lineno)d): %(message)s"
+
+level = os.environ.get('WUBLOADER_LOG_LEVEL', 'INFO').upper()
+logging.basicConfig(level=level, format=LOG_FORMAT)
+argh.dispatch_command(main)
diff --git a/buscribe-api/buscribeapi/buscribeapi.py b/buscribe-api/buscribeapi/buscribeapi.py
index 202285e..cae0941 100644
--- a/buscribe-api/buscribeapi/buscribeapi.py
+++ b/buscribe-api/buscribeapi/buscribeapi.py
@@ -1,14 +1,33 @@
+import json
+from datetime import timedelta
+
 import flask as flask
-from common import dateutil
+from common import dateutil, database
 from dateutil.parser import ParserError
-from flask import request
+from flask import request, jsonify, Response, render_template
 
 app = flask.Flask('buscribe')
 
 
+@app.template_filter()
+def convert_vtt_timedelta(delta: timedelta):
+    """Formats a timedelta as a WebVTT timestamp (hh:mm:ss.ttt).
+
+    Negative deltas are clamped to zero, since a WebVTT timestamp cannot be negative."""
+    total_ms = max(delta // timedelta(milliseconds=1), 0)
+    hours, remainder = divmod(total_ms, 3600 * 1000)
+    minutes, remainder = divmod(remainder, 60 * 1000)
+    seconds, milliseconds = divmod(remainder, 1000)
+    return f'{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}'
+
+
 @app.route('/buscribe/vtt')
 def get_vtt():
-    """Returns WebVTT subtitle file for the period between start_time and end_time."""
+    """Returns WebVTT subtitle file for the period between start_time and end_time.
+
+    Times are relative to --bustime-start.
+
+    TODO: Figure out proper offsets."""
     try:
         start_time_string = request.args.get('start_time')
         start_time = dateutil.parse(start_time_string)
@@ -25,23 +44,74 @@ def get_vtt():
     except ValueError:
         return "Missing end time!", 400
 
+    db_conn = app.db_manager.get_conn()
+
+    results = fetch_lines(db_conn, start_time, end_time)
+
+    return Response(
+        render_template("busubs.jinja", results=results, bustime_start=app.bustime_start,
+                        duration_extend=timedelta(seconds=0.3)),
+        mimetype="text/vtt"
+    )
+
 
 @app.route('/buscribe/json')
 def get_json():
     """Searches the line database for *query*, with optional start_time and end_time boundaries.
 
-    Search is done using PostgreSQL websearch_to_tsquery() (https://www.postgresql.org/docs/13/functions-textsearch.html)"""
-    start_time_string = request.args.get('start_time')
-    try:
-        start_time = dateutil.parse(start_time_string)
-    except ParserError:
-        return "Invalid start time!", 400
+    Search is done using PostgreSQL websearch_to_tsquery()
+    (https://www.postgresql.org/docs/13/functions-textsearch.html)"""
+
+    start_time_string = request.args.get('start_time', default=None)
+    if start_time_string is not None:
+        try:
+            start_time = dateutil.parse(start_time_string)
+        except ParserError:
+            return "Invalid start time!", 400
+    else:
+        start_time = None
 
     end_time_string = request.args.get('end_time', default=None)
-    try:
-        end_time = dateutil.parse(end_time_string)
-    except ParserError:
-        return "Invalid end time!", 400
+    if end_time_string is not None:
+        try:
+            end_time = dateutil.parse(end_time_string)
+        except ParserError:
+            return "Invalid end time!", 400
+    else:
+        end_time = None
 
     # I think websearch_to_tsquery() sanitizes its own input.
-    query = request.args.get('end_time', default=None)
+    query = request.args.get('query', default=None)
+
+    db_conn = app.db_manager.get_conn()
+
+    results = fetch_lines(db_conn, start_time, end_time, query)
+
+    return jsonify([{"start_time": row.start_time.isoformat(),
+                     "end_time": row.end_time.isoformat(),
+                     "text": row.transcription_line} for row in results])
+
+
+def fetch_lines(db_conn, start_time, end_time, query=None):
+    """Fetches transcription lines that lie strictly between start_time and end_time.
+
+    A bound of None leaves that side of the range open. If query is given, only lines
+    matching it (as parsed by websearch_to_tsquery()) are returned."""
+    # Postgres timestamps accept the special values '-infinity' and 'infinity'.
+    start_bound = start_time if start_time is not None else '-infinity'
+    end_bound = end_time if end_time is not None else 'infinity'
+
+    if query is None:
+        return database.query(db_conn, "SELECT * FROM buscribe_transcriptions WHERE "
+                                       "start_time > %s AND "
+                                       "end_time < %s;",
+                              start_bound, end_bound)
+
+    # The text search configuration must be given explicitly, and match the one used by
+    # buscribe_transcriptions_idx, or Postgres cannot use that index for this query.
+    return database.query(db_conn, "SELECT * FROM buscribe_transcriptions WHERE "
+                                   "start_time > %s AND "
+                                   "end_time < %s AND "
+                                   "to_tsvector('english', transcription_line) "
+                                   "@@ websearch_to_tsquery('english', %s);",
+                          start_bound, end_bound, query)
diff --git a/buscribe-api/buscribeapi/main.py b/buscribe-api/buscribeapi/main.py
index e69de29..47f935f 100644
--- a/buscribe-api/buscribeapi/main.py
+++ b/buscribe-api/buscribeapi/main.py
@@ -0,0 +1,61 @@
+import logging
+import os
+import sys
+
+import argh
+from common import dateutil
+from common.database import DBManager
+from dateutil.parser import ParserError
+from gevent.pywsgi import WSGIServer
+
+from buscribeapi.buscribeapi import app
+
+
+def cors(app):
+    """WSGI middleware that sets CORS headers"""
+    HEADERS = [
+        ("Access-Control-Allow-Credentials", "false"),
+        ("Access-Control-Allow-Headers", "*"),
+        ("Access-Control-Allow-Methods", "GET,HEAD"),
+        ("Access-Control-Allow-Origin", "*"),
+        ("Access-Control-Max-Age", "86400"),
+    ]
+
+    def handle(environ, start_response):
+        def _start_response(status, headers, exc_info=None):
+            headers += HEADERS
+            return start_response(status, headers, exc_info)
+
+        return app(environ, _start_response)
+
+    return handle
+
+
+@argh.arg('--host',
+          help='Address or socket server will listen to. Default is 0.0.0.0 (everything on the local machine).')
+@argh.arg('--port',
+          help='Port server will listen on. Default is 8005.')
+@argh.arg('--database',
+          help='Postgres connection string, which is either a space-separated list of key=value pairs, or a URI like: '
+               'postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE')
+@argh.arg('--bustime-start',
+          help='The start time in UTC for the event, for UTC-Bustime conversion')
+def main(database="", host='0.0.0.0', port=8005, bustime_start=None):
+    """Runs the buscribe API server until it is shut down."""
+    if bustime_start is None:
+        logging.error("Missing --bustime-start!")
+        sys.exit(1)
+
+    server = WSGIServer((host, port), cors(app))
+
+    try:
+        app.bustime_start = dateutil.parse(bustime_start)
+    except ParserError:
+        logging.error("Invalid --bustime-start!")
+        sys.exit(1)
+
+    app.db_manager = DBManager(dsn=database)
+
+    logging.info('Starting up')
+    server.serve_forever()
+    logging.info("Gracefully shut down")
diff --git a/buscribe-api/setup.py b/buscribe-api/setup.py
index 33ab450..ba92f10 100644
--- a/buscribe-api/setup.py
+++ b/buscribe-api/setup.py
@@ -7,6 +7,7 @@ setup(
     install_requires = [
         "argh",
         "psycopg2",
+        "gevent",
         "greenlet==0.4.16",
         "psycogreen",
         "wubloader-common",
diff --git a/buscribe-api/templates/busub.jinja b/buscribe-api/templates/busub.jinja
new file mode 100644
index 0000000..51b41be
--- /dev/null
+++ b/buscribe-api/templates/busub.jinja
@@ -0,0 +1,2 @@
+{{ (row.start_time - bustime_start - duration_extend)|convert_vtt_timedelta }} --> {{ (row.end_time - bustime_start + duration_extend)|convert_vtt_timedelta }}
+- {{ row.transcription_line }}
diff --git a/buscribe-api/templates/busubs.jinja b/buscribe-api/templates/busubs.jinja
new file mode 100644
index 0000000..258e009
--- /dev/null
+++ b/buscribe-api/templates/busubs.jinja
@@ -0,0 +1,5 @@
+WEBVTT
+
+{% for row in results %}
+{% include "busub.jinja" %}
+{% endfor %}
\ No newline at end of file
diff --git a/buscribe_data.sql b/buscribe_data.sql
index f3266ed..c97cf73 100644
--- a/buscribe_data.sql
+++ b/buscribe_data.sql
@@ -11,4 +11,8 @@ CREATE TABLE buscribe_transcriptions
 );
 
 CREATE INDEX buscribe_transcriptions_idx ON buscribe_transcriptions USING
-    GIN (to_tsvector('english', transcription_line));
\ No newline at end of file
+    GIN (to_tsvector('english', transcription_line));
+
+-- This might not actually be needed. Check once there is more data.
+CREATE INDEX buscribe_start_time_idx ON buscribe_transcriptions (start_time);
+CREATE INDEX buscribe_end_time_idx ON buscribe_transcriptions (end_time);
\ No newline at end of file