From 5260fb60c01023cb137dd11fb26290c25af3ede5 Mon Sep 17 00:00:00 2001 From: HeNine <> Date: Wed, 17 Nov 2021 13:40:40 +0100 Subject: [PATCH] search optimization 5: the morning after --- buscribe-api/buscribeapi/buscribeapi.py | 142 +++++++++++++++++------- buscribe_data.sql | 6 +- 2 files changed, 102 insertions(+), 46 deletions(-) diff --git a/buscribe-api/buscribeapi/buscribeapi.py b/buscribe-api/buscribeapi/buscribeapi.py index 2c78810..84b4c40 100644 --- a/buscribe-api/buscribeapi/buscribeapi.py +++ b/buscribe-api/buscribeapi/buscribeapi.py @@ -118,51 +118,107 @@ def get_json(): def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None): - if ts_query is None: - query = "SELECT *" + \ - ",transcription_line AS highlighted_text" + \ - " FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s " + \ - "ORDER BY start_time " - - if limit is not None: - query += "LIMIT %(limit)s " - - if offset is not None: - query += "OFFSET %(limit)s " - - query += ";" - - else: - query = f""" - WITH q AS ( - SELECT convert_query({"%(text_query)s" if ts_query is not None else "NULL"}) + query = f""" + WITH q AS ( + SELECT convert_query(%(text_query)s) +), + time_window AS ( + SELECT id + FROM buscribe_transcriptions + WHERE start_time >= %(start_time)s + AND end_time <= %(end_time)s + ), + relevant_lines AS ( + ( + SELECT id + FROM buscribe_transcriptions + WHERE id IN (SELECT id FROM time_window) + {"AND to_tsvector('english', transcription_line) @@ (SELECT * FROM q)" if ts_query else ""} + ) + UNION + ( + SELECT line + FROM buscribe_verified_lines + WHERE line IN (SELECT id FROM time_window) + {"AND to_tsvector('english', verified_line) @@ (SELECT * FROM q)" if ts_query else ""} + ) + UNION + ( + SELECT line + FROM buscribe_line_speakers + INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id + WHERE line IN (SELECT id FROM time_window) + {"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""} + ) + UNION + ( + SELECT line + FROM buscribe_line_inferred_speakers + INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id + WHERE line IN (SELECT id FROM time_window) + {"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""} + ) + ) + ( + (SELECT id, + start_time, + end_time, + null AS verifier, + names, + transcription_line, + ts_rank_cd(coalesce(to_tsvector('english', transcription_line), ''::tsvector) || + coalesce(to_tsvector(array_to_string(names, ' ')), ''::tsvector), (SELECT * FROM q)) AS rank, + transcription_json + FROM buscribe_transcriptions + LEFT OUTER JOIN (SELECT line, array_agg(name) AS names + FROM buscribe_line_inferred_speakers + INNER JOIN buscribe_speakers + ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id + GROUP BY line) AS inferred_speakers ON id = inferred_speakers.line + WHERE id IN (SELECT id FROM relevant_lines) ) - (SELECT *, ts_headline(transcription_line, (SELECT * FROM q), - 'StartSel='''', StopSel=') AS highlighted_text - FROM buscribe_all_transcriptions2 - WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s - {"AND verified_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""} - ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" + - "coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""} - start_time) UNION - (SELECT *, ts_headline(transcription_line, (SELECT * FROM q), - 'StartSel='''', StopSel=') AS highlighted_text - FROM buscribe_all_transcriptions2 - WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s - {"AND machine_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""} - ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" + - "coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""} - start_time) - """ - - if limit is not None: - query += "LIMIT %(limit)s " - - if offset is not None: - query += "OFFSET %(limit)s " - - query += ";" + ( + SELECT buscribe_transcriptions.id AS id, + start_time, + end_time, + cverifier AS verifier, + names, + coalesce(verifications.verified_line, + buscribe_transcriptions.transcription_line) AS transcription_line, + ts_rank_cd(coalesce( + setweight(to_tsvector('english', verified_line), 'C'), + to_tsvector('english', buscribe_transcriptions.transcription_line), + ''::tsvector) || + coalesce(setweight(to_tsvector(array_to_string(names, ' ')), 'C'), ''::tsvector), + (SELECT * FROM q)) AS rank, + null AS transcription_json + FROM buscribe_transcriptions + INNER JOIN ( + SELECT *, + coalesce(relevant_verified.line, relevant_speakers.line) AS cline, + coalesce(relevant_verified.verifier, relevant_speakers.verifier) AS cverifier + FROM (SELECT * + FROM buscribe_verified_lines + WHERE line IN (SELECT id FROM relevant_lines)) AS relevant_verified + FULL OUTER JOIN + (SELECT line, verifier, array_agg(name) AS names + FROM buscribe_line_speakers + INNER JOIN buscribe_speakers + ON buscribe_line_speakers.speaker = buscribe_speakers.id + WHERE line IN (SELECT id FROM relevant_lines) + GROUP BY line, verifier) AS relevant_speakers + ON relevant_verified.line = relevant_speakers.line AND + relevant_speakers.verifier = relevant_verified.verifier) AS verifications + ON id = verifications.cline + ) + ) + ORDER BY + {"rank DESC," if ts_query is not None else ""} + start_time + {"OFFSET 0" if offset is not None else ""} + {"LIMIT 10" if limit is not None else ""}; + """ return database.query(db_conn, query, start_time=start_time if start_time is not None else '-infinity', diff --git a/buscribe_data.sql b/buscribe_data.sql index aae2f6f..b69c379 100644 --- a/buscribe_data.sql +++ b/buscribe_data.sql @@ -136,8 +136,8 @@ SELECT buscribe_transcriptions.id, coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier, names, coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line, - to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts, - setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts, + to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts, + setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts, coalesce(setweight(to_tsvector('english', verified_line), 'C'), to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts, setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts, @@ -162,7 +162,7 @@ SELECT id, names, transcription_line, to_tsvector('english', transcription_line) AS machine_line_ts, - null AS verified_line_ts, + null AS verified_line_ts, to_tsvector('english', transcription_line) AS transcription_line_ts, null AS names_ts, transcription_json