mirror of https://github.com/ekimekim/wubloader
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
186 lines
7.2 KiB
PL/PgSQL
186 lines
7.2 KiB
PL/PgSQL
BEGIN TRANSACTION;
|
|
|
|
DROP TABLE IF EXISTS buscribe_verified_lines;
|
|
DROP TABLE IF EXISTS buscribe_line_speakers;
|
|
DROP TABLE IF EXISTS buscribe_speakers;
|
|
DROP TABLE IF EXISTS buscribe_verifiers;
|
|
DROP TABLE IF EXISTS buscribe_transcriptions;
|
|
|
|
ROLLBACK;
|
|
|
|
BEGIN TRANSACTION;
|
|
|
|
TRUNCATE buscribe_verified_lines RESTART IDENTITY CASCADE;
|
|
TRUNCATE buscribe_line_speakers RESTART IDENTITY CASCADE;
|
|
TRUNCATE buscribe_speakers RESTART IDENTITY CASCADE;
|
|
TRUNCATE buscribe_verifiers RESTART IDENTITY CASCADE;
|
|
TRUNCATE buscribe_transcriptions RESTART IDENTITY CASCADE;
|
|
|
|
ROLLBACK;
|
|
|
|
CREATE TABLE buscribe_transcriptions
|
|
(
|
|
id BIGSERIAL PRIMARY KEY,
|
|
start_time timestamp without time zone NOT NULL,
|
|
end_time timestamp without time zone NOT NULL,
|
|
transcription_line text NOT NULL,
|
|
line_speaker float[128],
|
|
transcription_json jsonb NOT NULL
|
|
);
|
|
|
|
CREATE INDEX buscribe_transcriptions_idx ON buscribe_transcriptions USING
|
|
GIN (to_tsvector('english', transcription_line));
|
|
|
|
-- This might not actually be needed. Check once there is more data.
|
|
CREATE INDEX buscribe_start_time_idx ON buscribe_transcriptions (start_time);
|
|
CREATE INDEX buscribe_end_time_idx ON buscribe_transcriptions (end_time);
|
|
|
|
CREATE TABLE buscribe_speakers
|
|
(
|
|
id BIGSERIAL PRIMARY KEY,
|
|
name text NOT NULL UNIQUE CHECK ( name != '' )
|
|
);
|
|
|
|
CREATE TABLE buscribe_verifiers
|
|
(
|
|
-- id SERIAL PRIMARY KEY,
|
|
email TEXT PRIMARY KEY,
|
|
name TEXT NOT NULL
|
|
);
|
|
|
|
-- For testing
|
|
-- INSERT INTO buscribe_verifiers(email, name)
|
|
-- VALUES ('placeholder@example.com', 'Place Holder'),
|
|
-- ('aguy@example.com', 'Arnold Guyana');
|
|
|
|
CREATE TABLE buscribe_line_speakers
|
|
(
|
|
-- id BIGSERIAL PRIMARY KEY,
|
|
line BIGINT NOT NULL REFERENCES buscribe_transcriptions,
|
|
speaker BIGINT NOT NULL REFERENCES buscribe_speakers,
|
|
verifier text NOT NULL REFERENCES buscribe_verifiers,
|
|
PRIMARY KEY (line, speaker, verifier)
|
|
);
|
|
|
|
CREATE TABLE buscribe_line_inferred_speakers
|
|
(
|
|
line BIGINT NOT NULL REFERENCES buscribe_transcriptions,
|
|
speaker BIGINT NOT NULL REFERENCES buscribe_speakers,
|
|
PRIMARY KEY (line, speaker)
|
|
);
|
|
|
|
CREATE TABLE buscribe_verified_lines
|
|
(
|
|
-- id BIGSERIAL PRIMARY KEY,
|
|
line BIGINT NOT NULL REFERENCES buscribe_transcriptions,
|
|
verified_line TEXT NOT NULL,
|
|
verifier text REFERENCES buscribe_verifiers,
|
|
PRIMARY KEY (line, verifier)
|
|
);
|
|
|
|
-- Indexed with C weight (0.2 vs default 0.1)
|
|
CREATE INDEX buscribe_verified_lines_idx ON buscribe_verified_lines USING
|
|
GIN (setweight(to_tsvector('english', verified_line), 'C'));
|
|
|
|
BEGIN;
|
|
|
|
DROP VIEW buscribe_all_transcriptions;
|
|
|
|
CREATE VIEW buscribe_all_transcriptions AS
|
|
SELECT buscribe_transcriptions.id,
|
|
start_time,
|
|
end_time,
|
|
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
|
|
names,
|
|
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
|
|
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
|
|
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
|
|
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
|
|
null AS transcription_json
|
|
FROM buscribe_transcriptions
|
|
LEFT OUTER JOIN buscribe_verified_lines ON buscribe_transcriptions.id = buscribe_verified_lines.line
|
|
LEFT OUTER JOIN (
|
|
SELECT line, verifier, array_agg(name) AS names
|
|
FROM buscribe_line_speakers
|
|
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
|
|
GROUP BY line, verifier
|
|
) AS speakers ON buscribe_transcriptions.id = speakers.line AND (
|
|
speakers.verifier = buscribe_verified_lines.verifier OR
|
|
buscribe_verified_lines.verifier IS NULL
|
|
)
|
|
WHERE coalesce(buscribe_verified_lines.verifier, speakers.verifier) IS NOT NULL
|
|
UNION
|
|
SELECT id,
|
|
start_time,
|
|
end_time,
|
|
null AS verifier,
|
|
names,
|
|
transcription_line,
|
|
to_tsvector('english', transcription_line) AS transcription_line_ts,
|
|
null AS names_ts,
|
|
transcription_json
|
|
FROM buscribe_transcriptions
|
|
LEFT OUTER JOIN (
|
|
SELECT line, array_agg(name) AS names
|
|
FROM buscribe_line_inferred_speakers
|
|
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
|
|
GROUP BY line
|
|
) AS speakers ON id = speakers.line;
|
|
|
|
ROLLBACK;
|
|
|
|
CREATE VIEW buscribe_all_transcriptions2 AS
|
|
SELECT buscribe_transcriptions.id,
|
|
start_time,
|
|
end_time,
|
|
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
|
|
names,
|
|
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
|
|
to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
|
|
setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
|
|
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
|
|
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
|
|
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
|
|
null AS transcription_json
|
|
FROM buscribe_transcriptions
|
|
LEFT OUTER JOIN buscribe_verified_lines ON buscribe_transcriptions.id = buscribe_verified_lines.line
|
|
LEFT OUTER JOIN (
|
|
SELECT line, verifier, array_agg(name) AS names
|
|
FROM buscribe_line_speakers
|
|
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
|
|
GROUP BY line, verifier
|
|
) AS speakers ON buscribe_transcriptions.id = speakers.line AND (
|
|
speakers.verifier = buscribe_verified_lines.verifier OR
|
|
buscribe_verified_lines.verifier IS NULL
|
|
)
|
|
WHERE coalesce(buscribe_verified_lines.verifier, speakers.verifier) IS NOT NULL
|
|
UNION
|
|
SELECT id,
|
|
start_time,
|
|
end_time,
|
|
null AS verifier,
|
|
names,
|
|
transcription_line,
|
|
to_tsvector('english', transcription_line) AS machine_line_ts,
|
|
null AS verified_line_ts,
|
|
to_tsvector('english', transcription_line) AS transcription_line_ts,
|
|
null AS names_ts,
|
|
transcription_json
|
|
FROM buscribe_transcriptions
|
|
LEFT OUTER JOIN (
|
|
SELECT line, array_agg(name) AS names
|
|
FROM buscribe_line_inferred_speakers
|
|
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
|
|
GROUP BY line
|
|
) AS speakers ON id = speakers.line;
|
|
|
|
-- Convert last lexeme in a query to prefix query.
|
|
CREATE FUNCTION convert_query(query_text text) RETURNS tsquery AS
|
|
$$
|
|
DECLARE
|
|
ws_query text := websearch_to_tsquery(query_text)::text;
|
|
BEGIN
|
|
RETURN (CASE WHEN ws_query != '' THEN ws_query || ':*' ELSE '' END)::tsquery;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|