You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
wubloader/buscribe-api/buscribeapi/buscribeapi.py

143 lines
5.1 KiB
Python

import json
from datetime import timedelta
import flask as flask
import common
from common import dateutil, database
from dateutil.parser import ParserError
from flask import request, jsonify, Response, render_template
app = flask.Flask('buscribe')
@app.template_filter()
def convert_vtt_timedelta(delta: timedelta):
"""Converts a timedelta to a VTT compatible format."""
return f'{delta.days * 24 + delta.seconds // 3600:02}:{(delta.seconds % 3600) // 60:02}:{delta.seconds % 60:02}.{delta.microseconds // 1000:03}'
@app.template_filter()
def create_seconds_timedelta(seconds):
"""Converts a float of seconds to a timedelta."""
return timedelta(seconds=seconds)
def round_bus_time(delta: timedelta):
"""Round bus time down to the second."""
return f'{delta.days * 24 + delta.seconds // 3600:02}:{(delta.seconds % 3600) // 60:02}:{delta.seconds % 60:02}'
@app.route('/buscribe/vtt')
def get_vtt():
"""Returns WebVTT subtitle file for the period between start_time and end_time.
Times are relative to --bustime-start.
TODO: Figure out proper offsets."""
try:
start_time_string = request.args.get('start_time')
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
except KeyError:
return "Missing start time!", 400
try:
end_time_string = request.args['end_time']
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
except KeyError:
return "Missing end time!", 400
db_conn = app.db_manager.get_conn()
segments = common.get_best_segments(app.segments_dir,
start_time,
end_time)
segments_start_time = segments[0].start
results = fetch_lines(db_conn, start_time, end_time)
return Response(
render_template("busubs.jinja", results=results, start_time=segments_start_time,
duration_extend=timedelta(seconds=0.3)),
mimetype="text/vtt"
)
@app.route('/buscribe/json')
def get_json():
"""Searches the line database for *query*, with optional start_time and end_time boundaries.
Search is done using PostgreSQL websearch_to_tsquery()
(https://www.postgresql.org/docs/13/functions-textsearch.html)"""
start_time_string = request.args.get('start_time')
if start_time_string is not None:
try:
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
else:
start_time = None
end_time_string = request.args.get('end_time')
if end_time_string is not None:
try:
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
else:
end_time = None
# I think websearch_to_tsquery() sanitizes its own input.
query = request.args.get('query', default=None)
limit = request.args.get('limit', default=None, type=int)
offset = request.args.get('offset', default=None, type=int)
db_conn = app.db_manager.get_conn()
results = fetch_lines(db_conn, start_time, end_time, query, limit, offset)
return jsonify([{"id": row.id,
"start_time": row.start_time.isoformat(),
"start_bus_time": round_bus_time(row.start_time - app.bustime_start),
"end_time": row.end_time.isoformat(),
"end_bus_time": round_bus_time(row.start_time - app.bustime_start),
"verifier": row.verifier,
"speakers": row.names,
"text": row.highlighted_text if row.highlighted_text is not None else ""} for row in results])
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
query = "SELECT *" + \
(
",ts_headline(transcription_line, convert_query(%(text_query)s), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text" if ts_query is not None else ",transcription_line AS highlighted_text") + \
" FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s "
if ts_query is not None:
query += "AND (coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector)) @@ " \
"convert_query(%(text_query)s) " \
"ORDER BY ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector), convert_query(%(text_query)s)) DESC, " \
"start_time "
else:
query += "ORDER BY start_time "
if limit is not None:
query += "LIMIT %(limit)s "
if offset is not None:
query += "OFFSET %(limit)s "
query += ";"
return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity',
end_time=end_time if end_time is not None else 'infinity',
text_query=ts_query,
limit=limit,
offset=offset
)