From 5260fb60c01023cb137dd11fb26290c25af3ede5 Mon Sep 17 00:00:00 2001
From: HeNine <>
Date: Wed, 17 Nov 2021 13:40:40 +0100
Subject: [PATCH] search optimization 5: the morning after
---
buscribe-api/buscribeapi/buscribeapi.py | 142 +++++++++++++++++-------
buscribe_data.sql | 6 +-
2 files changed, 102 insertions(+), 46 deletions(-)
diff --git a/buscribe-api/buscribeapi/buscribeapi.py b/buscribe-api/buscribeapi/buscribeapi.py
index 2c78810..84b4c40 100644
--- a/buscribe-api/buscribeapi/buscribeapi.py
+++ b/buscribe-api/buscribeapi/buscribeapi.py
@@ -118,51 +118,107 @@ def get_json():
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
- if ts_query is None:
- query = "SELECT *" + \
- ",transcription_line AS highlighted_text" + \
- " FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s " + \
- "ORDER BY start_time "
-
- if limit is not None:
- query += "LIMIT %(limit)s "
-
- if offset is not None:
- query += "OFFSET %(limit)s "
-
- query += ";"
-
- else:
- query = f"""
- WITH q AS (
- SELECT convert_query({"%(text_query)s" if ts_query is not None else "NULL"})
+ query = f"""
+ WITH q AS (
+ SELECT convert_query(%(text_query)s)
+),
+ time_window AS (
+ SELECT id
+ FROM buscribe_transcriptions
+ WHERE start_time >= %(start_time)s
+ AND end_time <= %(end_time)s
+ ),
+ relevant_lines AS (
+ (
+ SELECT id
+ FROM buscribe_transcriptions
+ WHERE id IN (SELECT id FROM time_window)
+ {"AND to_tsvector('english', transcription_line) @@ (SELECT * FROM q)" if ts_query else ""}
+ )
+ UNION
+ (
+ SELECT line
+ FROM buscribe_verified_lines
+ WHERE line IN (SELECT id FROM time_window)
+ {"AND to_tsvector('english', verified_line) @@ (SELECT * FROM q)" if ts_query else ""}
+ )
+ UNION
+ (
+ SELECT line
+ FROM buscribe_line_speakers
+ INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
+ WHERE line IN (SELECT id FROM time_window)
+ {"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
+ )
+ UNION
+ (
+ SELECT line
+ FROM buscribe_line_inferred_speakers
+ INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
+ WHERE line IN (SELECT id FROM time_window)
+ {"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
+ )
+ )
+ (
+ (SELECT id,
+ start_time,
+ end_time,
+ null AS verifier,
+ names,
+ transcription_line,
+ ts_rank_cd(coalesce(to_tsvector('english', transcription_line), ''::tsvector) ||
+ coalesce(to_tsvector(array_to_string(names, ' ')), ''::tsvector), (SELECT * FROM q)) AS rank,
+ transcription_json
+ FROM buscribe_transcriptions
+ LEFT OUTER JOIN (SELECT line, array_agg(name) AS names
+ FROM buscribe_line_inferred_speakers
+ INNER JOIN buscribe_speakers
+ ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
+ GROUP BY line) AS inferred_speakers ON id = inferred_speakers.line
+ WHERE id IN (SELECT id FROM relevant_lines)
)
- (SELECT *, ts_headline(transcription_line, (SELECT * FROM q),
- 'StartSel='''', StopSel=') AS highlighted_text
- FROM buscribe_all_transcriptions2
- WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s
- {"AND verified_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""}
- ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" +
- "coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""}
- start_time)
UNION
- (SELECT *, ts_headline(transcription_line, (SELECT * FROM q),
- 'StartSel='''', StopSel=') AS highlighted_text
- FROM buscribe_all_transcriptions2
- WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s
- {"AND machine_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""}
- ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" +
- "coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""}
- start_time)
- """
-
- if limit is not None:
- query += "LIMIT %(limit)s "
-
- if offset is not None:
- query += "OFFSET %(limit)s "
-
- query += ";"
+ (
+ SELECT buscribe_transcriptions.id AS id,
+ start_time,
+ end_time,
+ cverifier AS verifier,
+ names,
+ coalesce(verifications.verified_line,
+ buscribe_transcriptions.transcription_line) AS transcription_line,
+ ts_rank_cd(coalesce(
+ setweight(to_tsvector('english', verified_line), 'C'),
+ to_tsvector('english', buscribe_transcriptions.transcription_line),
+ ''::tsvector) ||
+ coalesce(setweight(to_tsvector(array_to_string(names, ' ')), 'C'), ''::tsvector),
+ (SELECT * FROM q)) AS rank,
+ null AS transcription_json
+ FROM buscribe_transcriptions
+ INNER JOIN (
+ SELECT *,
+ coalesce(relevant_verified.line, relevant_speakers.line) AS cline,
+ coalesce(relevant_verified.verifier, relevant_speakers.verifier) AS cverifier
+ FROM (SELECT *
+ FROM buscribe_verified_lines
+ WHERE line IN (SELECT id FROM relevant_lines)) AS relevant_verified
+ FULL OUTER JOIN
+ (SELECT line, verifier, array_agg(name) AS names
+ FROM buscribe_line_speakers
+ INNER JOIN buscribe_speakers
+ ON buscribe_line_speakers.speaker = buscribe_speakers.id
+ WHERE line IN (SELECT id FROM relevant_lines)
+ GROUP BY line, verifier) AS relevant_speakers
+ ON relevant_verified.line = relevant_speakers.line AND
+ relevant_speakers.verifier = relevant_verified.verifier) AS verifications
+ ON id = verifications.cline
+ )
+ )
+ ORDER BY
+ {"rank DESC," if ts_query is not None else ""}
+ start_time
+ {"OFFSET 0" if offset is not None else ""}
+ {"LIMIT 10" if limit is not None else ""};
+ """
return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity',
diff --git a/buscribe_data.sql b/buscribe_data.sql
index aae2f6f..b69c379 100644
--- a/buscribe_data.sql
+++ b/buscribe_data.sql
@@ -136,8 +136,8 @@ SELECT buscribe_transcriptions.id,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names,
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
- to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
- setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
+ to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
+ setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
@@ -162,7 +162,7 @@ SELECT id,
names,
transcription_line,
to_tsvector('english', transcription_line) AS machine_line_ts,
- null AS verified_line_ts,
+ null AS verified_line_ts,
to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts,
transcription_json