You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
buscribe/buscribe-api/buscribeapi/buscribeapi.py

172 lines
6.9 KiB
Python

from datetime import timedelta
import common
import flask as flask
from common import dateutil, database, format_bustime, dt_to_bustime, bustime_to_dt, parse_bustime
from dateutil.parser import ParserError
from flask import request, jsonify, Response, render_template
app = flask.Flask('buscribe')
@app.template_filter()
def convert_vtt_timedelta(delta: timedelta):
"""Converts a timedelta to a VTT compatible format."""
return f'{delta.days * 24 + delta.seconds // 3600:02}:{(delta.seconds % 3600) // 60:02}:{delta.seconds % 60:02}.{delta.microseconds // 1000:03}'
@app.template_filter()
def create_seconds_timedelta(seconds):
"""Converts a float of seconds to a timedelta."""
return timedelta(seconds=seconds)
@app.route('/buscribe/vtt')
def get_vtt():
"""Returns WebVTT subtitle file for the period between start_time and end_time.
Times are relative to --bustime-start.
TODO: Figure out proper offsets."""
try:
start_time_string = request.args.get('start_time')
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
except KeyError:
return "Missing start time!", 400
try:
end_time_string = request.args['end_time']
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
except KeyError:
return "Missing end time!", 400
db_conn = app.db_manager.get_conn()
segments = common.get_best_segments(app.segments_dir,
start_time,
end_time)
segments_start_time = segments[0].start
results = fetch_lines(db_conn, start_time, end_time)
return Response(
render_template("busubs.jinja", results=results, start_time=segments_start_time,
duration_extend=timedelta(seconds=0.3)),
mimetype="text/vtt"
)
@app.route('/buscribe/json')
def get_json():
"""Searches the line database for *query*, with optional start_time and end_time boundaries.
Search is done using PostgreSQL websearch_to_tsquery()
(https://www.postgresql.org/docs/13/functions-textsearch.html)"""
start_time_string = request.args.get('start_time')
bus_start_time_string = request.args.get('bus_start_time')
if start_time_string is not None:
try:
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
elif bus_start_time_string is not None:
try:
start_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_start_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
start_time = None
end_time_string = request.args.get('end_time')
bus_end_time_string = request.args.get('bus_end_time')
if end_time_string is not None:
try:
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
elif bus_end_time_string is not None:
try:
end_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_end_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
end_time = None
# I think websearch_to_tsquery() sanitizes its own input.
query = request.args.get('query', default=None)
limit = request.args.get('limit', default=None, type=int)
offset = request.args.get('offset', default=None, type=int)
db_conn = app.db_manager.get_conn()
results = fetch_lines(db_conn, start_time, end_time, query, limit, offset)
return jsonify([{"id": row.id,
"start_time": row.start_time.isoformat(),
"start_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.start_time), "second"),
"end_time": row.end_time.isoformat(),
"end_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.end_time), "second"),
"verifier": row.verifier,
"speakers": row.names,
"text": row.highlighted_text if row.highlighted_text is not None else ""} for row in results])
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
query = "SELECT *" + \
(
",ts_headline(transcription_line, convert_query(%(text_query)s), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text" if ts_query is not None else ",transcription_line AS highlighted_text") + \
" FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s "
if ts_query is not None:
query += "AND (coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector)) @@ " \
"convert_query(%(text_query)s) " \
"ORDER BY ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector), convert_query(%(text_query)s)) DESC, " \
"start_time "
else:
query += "ORDER BY start_time "
if limit is not None:
query += "LIMIT %(limit)s "
if offset is not None:
query += "OFFSET %(limit)s "
query += ";"
query = f"""
WITH q AS (
SELECT convert_query(${"%(text_query)s" if ts_query is not None else ""})
)
(SELECT *, ts_headline(transcription_line, (SELECT * FROM q),
'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text
FROM buscribe_all_transcriptions2
WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s
${"AND verified_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""}
ORDER BY ${"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" +
"coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""}
start_time)
UNION
(SELECT *, ts_headline(transcription_line, (SELECT * FROM q),
'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text
FROM buscribe_all_transcriptions2
WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s
${"AND machine_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""}
ORDER BY ${"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" +
"coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""}
start_time)
"""
return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity',
end_time=end_time if end_time is not None else 'infinity',
text_query=ts_query,
limit=limit,
offset=offset
)