search optimization 5: the morning after

trunk
HeNine 3 years ago
parent 881712ed29
commit 5260fb60c0

@ -118,51 +118,107 @@ def get_json():
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None): def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
if ts_query is None: query = f"""
query = "SELECT *" + \ WITH q AS (
",transcription_line AS highlighted_text" + \ SELECT convert_query(%(text_query)s)
" FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s " + \ ),
"ORDER BY start_time " time_window AS (
SELECT id
if limit is not None: FROM buscribe_transcriptions
query += "LIMIT %(limit)s " WHERE start_time >= %(start_time)s
AND end_time <= %(end_time)s
if offset is not None: ),
query += "OFFSET %(limit)s " relevant_lines AS (
(
query += ";" SELECT id
FROM buscribe_transcriptions
else: WHERE id IN (SELECT id FROM time_window)
query = f""" {"AND to_tsvector('english', transcription_line) @@ (SELECT * FROM q)" if ts_query else ""}
WITH q AS ( )
SELECT convert_query({"%(text_query)s" if ts_query is not None else "NULL"}) UNION
(
SELECT line
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector('english', verified_line) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
)
(
(SELECT id,
start_time,
end_time,
null AS verifier,
names,
transcription_line,
ts_rank_cd(coalesce(to_tsvector('english', transcription_line), ''::tsvector) ||
coalesce(to_tsvector(array_to_string(names, ' ')), ''::tsvector), (SELECT * FROM q)) AS rank,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line) AS inferred_speakers ON id = inferred_speakers.line
WHERE id IN (SELECT id FROM relevant_lines)
) )
(SELECT *, ts_headline(transcription_line, (SELECT * FROM q),
'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text
FROM buscribe_all_transcriptions2
WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s
{"AND verified_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""}
ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" +
"coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""}
start_time)
UNION UNION
(SELECT *, ts_headline(transcription_line, (SELECT * FROM q), (
'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text SELECT buscribe_transcriptions.id AS id,
FROM buscribe_all_transcriptions2 start_time,
WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s end_time,
{"AND machine_line_ts @@ (SELECT * FROM q)" if ts_query is not None else ""} cverifier AS verifier,
ORDER BY {"ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) ||" + names,
"coalesce(names_ts, ''::tsvector), (SELECT * FROM q)) DESC," if ts_query is not None else ""} coalesce(verifications.verified_line,
start_time) buscribe_transcriptions.transcription_line) AS transcription_line,
""" ts_rank_cd(coalesce(
setweight(to_tsvector('english', verified_line), 'C'),
if limit is not None: to_tsvector('english', buscribe_transcriptions.transcription_line),
query += "LIMIT %(limit)s " ''::tsvector) ||
coalesce(setweight(to_tsvector(array_to_string(names, ' ')), 'C'), ''::tsvector),
if offset is not None: (SELECT * FROM q)) AS rank,
query += "OFFSET %(limit)s " null AS transcription_json
FROM buscribe_transcriptions
query += ";" INNER JOIN (
SELECT *,
coalesce(relevant_verified.line, relevant_speakers.line) AS cline,
coalesce(relevant_verified.verifier, relevant_speakers.verifier) AS cverifier
FROM (SELECT *
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM relevant_lines)) AS relevant_verified
FULL OUTER JOIN
(SELECT line, verifier, array_agg(name) AS names
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM relevant_lines)
GROUP BY line, verifier) AS relevant_speakers
ON relevant_verified.line = relevant_speakers.line AND
relevant_speakers.verifier = relevant_verified.verifier) AS verifications
ON id = verifications.cline
)
)
ORDER BY
{"rank DESC," if ts_query is not None else ""}
start_time
{"OFFSET 0" if offset is not None else ""}
{"LIMIT 10" if limit is not None else ""};
"""
return database.query(db_conn, query, return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity', start_time=start_time if start_time is not None else '-infinity',

@ -136,8 +136,8 @@ SELECT buscribe_transcriptions.id,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier, coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names, names,
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line, coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts, to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts, setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
coalesce(setweight(to_tsvector('english', verified_line), 'C'), coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts, to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts, setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
@ -162,7 +162,7 @@ SELECT id,
names, names,
transcription_line, transcription_line,
to_tsvector('english', transcription_line) AS machine_line_ts, to_tsvector('english', transcription_line) AS machine_line_ts,
null AS verified_line_ts, null AS verified_line_ts,
to_tsvector('english', transcription_line) AS transcription_line_ts, to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts, null AS names_ts,
transcription_json transcription_json

Loading…
Cancel
Save