From 6c9873ac60ec161522be3dd19d8647621120596f Mon Sep 17 00:00:00 2001 From: HeNine <> Date: Wed, 20 Oct 2021 15:06:48 +0200 Subject: [PATCH] Add hand-transcribed lines to search Add partial word search --- buscribe-api/buscribeapi/buscribeapi.py | 14 +++++++------ buscribe-api/buscribeapi/main.py | 2 +- buscribe-web/line.less | 4 ++++ buscribe-web/script.js | 5 ++++- buscribe_data.sql | 27 +++++++++++++++++++++++++ 5 files changed, 44 insertions(+), 8 deletions(-) diff --git a/buscribe-api/buscribeapi/buscribeapi.py b/buscribe-api/buscribeapi/buscribeapi.py index a099db3..fb7c4af 100644 --- a/buscribe-api/buscribeapi/buscribeapi.py +++ b/buscribe-api/buscribeapi/buscribeapi.py @@ -88,20 +88,24 @@ def get_json(): results = fetch_lines(db_conn, start_time, end_time, query, limit, offset) - return jsonify([{"start_time": row.start_time.isoformat(), + return jsonify([{"id": row.id, + "start_time": row.start_time.isoformat(), "start_bus_time": round_bus_time(row.start_time - app.bustime_start), "end_time": row.end_time.isoformat(), "end_bus_time": round_bus_time(row.start_time - app.bustime_start), + "verifier": row.verifier, "text": row.transcription_line} for row in results]) def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None): - query = "SELECT * FROM buscribe_transcriptions WHERE start_time > %(start_time)s AND end_time < %(end_time)s " + query = "SELECT * FROM buscribe_all_transcriptions WHERE start_time > %(start_time)s AND end_time < %(end_time)s " if ts_query is not None: - query += "AND to_tsvector(transcription_line) @@ websearch_to_tsquery(%(text_query)s) " \ - "ORDER BY ts_rank_cd(to_tsvector(transcription_line), websearch_to_tsquery(%(text_query)s)) DESC, " \ + query += "AND transcription_line_ts @@ (websearch_to_tsquery(%(text_query)s)::text ||':*')::tsquery " \ + "ORDER BY ts_rank_cd(transcription_line_ts, (websearch_to_tsquery(%(text_query)s)::text ||':*')::tsquery) DESC, " \ "start_time" + else: + query += "ORDER BY 
start_time" if limit is not None: query += "LIMIT %(limit)s" @@ -111,8 +115,6 @@ def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset query += ";" - print(query) - return database.query(db_conn, query, start_time=start_time if start_time is not None else '-infinity', end_time=end_time if end_time is not None else 'infinity', diff --git a/buscribe-api/buscribeapi/main.py b/buscribe-api/buscribeapi/main.py index 369b679..62ed766 100644 --- a/buscribe-api/buscribeapi/main.py +++ b/buscribe-api/buscribeapi/main.py @@ -47,7 +47,7 @@ def servelet(server): 'postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE') @argh.arg('--bustime-start', help='The start time in UTC for the event, for UTC-Bustime conversion') -def main(database="", host='0.0.0.0', port=8005, bustime_start=None): +def main(database="", host='0.0.0.0', port=8010, bustime_start=None): if bustime_start is None: logging.error("Missing --bustime-start!") exit(1) diff --git a/buscribe-web/line.less b/buscribe-web/line.less index 63bceda..6ec4fe1 100644 --- a/buscribe-web/line.less +++ b/buscribe-web/line.less @@ -41,4 +41,8 @@ grid-column: text; grid-row: span 2; } +} + +.line.verified { + background: #555; } \ No newline at end of file diff --git a/buscribe-web/script.js b/buscribe-web/script.js index 6bbe36a..e24e96f 100644 --- a/buscribe-web/script.js +++ b/buscribe-web/script.js @@ -19,7 +19,7 @@ function query(text, start_time, end_time) { query_string += `&query=${text}` } - fetch(`http://localhost:8005/buscribe/json?${query_string}`) + fetch(`http://localhost:8010/buscribe/json?${query_string}`) .then(response => response.json()) // .then(response => console.log(response.error())) .then(fillResults) @@ -42,6 +42,9 @@ function fillResults(results) { const line_div = document.createElement("div"); line_div.classList.add("line"); + if (line.verifier) { + line_div.classList.add("verified"); + } line_div.innerHTML = `
${line.start_bus_time}
diff --git a/buscribe_data.sql b/buscribe_data.sql
index 21c849e..308aaaa 100644
--- a/buscribe_data.sql
+++ b/buscribe_data.sql
@@ -69,3 +69,30 @@ CREATE TABLE buscribe_verified_lines
     verifier text REFERENCES buscribe_verifiers,
     PRIMARY KEY (line, verifier)
 );
+
+-- Indexed with C weight (0.2 vs default D weight 0.1); must match the view's expression below
+CREATE INDEX buscribe_verified_lines_idx ON buscribe_verified_lines USING
+    GIN (setweight(to_tsvector('english', verified_line), 'C'));
+
+BEGIN; -- replace the combined view in a single transaction
+DROP VIEW IF EXISTS buscribe_all_transcriptions; -- IF EXISTS: a fresh database has no view yet
+CREATE VIEW buscribe_all_transcriptions AS
+SELECT "id",
+       start_time,
+       end_time,
+       null AS verifier, -- machine transcription: no verifier
+       transcription_line,
+       to_tsvector('english', transcription_line) AS transcription_line_ts
+FROM buscribe_transcriptions
+UNION
+SELECT "id",
+       start_time,
+       end_time,
+       verifier,
+       verified_line AS transcription_line, -- hand-verified text under the same column name
+       setweight(to_tsvector('english', verified_line), 'C') AS transcription_line_ts
+FROM buscribe_verified_lines
+         INNER JOIN buscribe_transcriptions ON (line = "id")
+ORDER BY "id";
+
+COMMIT; -- was ROLLBACK, which discarded the view that the API query selects from
\ No newline at end of file