Compare commits

...

43 Commits

Author SHA1 Message Date
Matija Rezar fbb6c4dca0 current stat dump 3 months ago
HeNine 0355c59ee8 search optimization 7 3 years ago
HeNine 7673c900ae search optimization 6: aaaaaaaa 3 years ago
HeNine 5260fb60c0 search optimization 5: the morning after 3 years ago
HeNine 881712ed29 search optimization 4 i give up 3 years ago
HeNine 8733526c01 search optimization 3 3 years ago
HeNine dd4dede5cd search optimization 2 3 years ago
HeNine 8a6f32975a search optimization 3 years ago
HeNine 8f6fc2b722 hotkeys 3 years ago
HeNine 0c43bcb714 get random line for tagging 3 years ago
HeNine ec2929c99f buscribe api segment path 3 years ago
HeNine 073d6c8769 improved display when speakers are set but line isn't 3 years ago
HeNine 7c60a407be blugh 3 years ago
HeNine ae1ca08dbe add line links 3 years ago
HeNine 9f4f15232f professor deployment 3 years ago
HeNine 46d228c42e preparing professor deployment 3 years ago
HeNine 770a97387a other authentication 3 years ago
HeNine fb2132dd16 front end authentication 3 years ago
HeNine 74155a7f6c idk what google is doing 3 years ago
HeNine 7a3328f5f3 show line context 3 years ago
HeNine e6325b31c8 Templates included. Closes #6 3 years ago
HeNine 2125f9d263 goating there 3 years ago
HeNine 87d6849670 Getting DB transcription up and running 3 years ago
HeNine 22cf34641b Cosmetics 3 years ago
HeNine c0b334695a time-related unbusening, cont. 3 years ago
HeNine 022c271e07 time-related unbusening 3 years ago
HeNine a1a0a47a72 git also sucks 3 years ago
HeNine 337e1fcc23 final touches closes #4 3 years ago
HeNine 22e270b6ff #4 Add bustime search options 3 years ago
HeNine 753784657e channel selection closes #5 3 years ago
HeNine 0a3fc28a01 prepare for channel selection 3 years ago
HeNine f217931fa3 bustime format 3 years ago
HeNine 481581b84f nginx fixest 3 years ago
HeNine 9f90fb822c nginx fixer 3 years ago
HeNine 0e9bf87d61 nginx fix 3 years ago
HeNine 61c69db72f more waiting 3 years ago
HeNine f7c05ff53d less 3 years ago
HeNine 090037f261 End of segments weirdness 3 years ago
HeNine 11954d1a31 Docker compose attempt 1 3 years ago
HeNine 2f4189bc31 One more database fix 3 years ago
HeNine 372c96f29d Update port to conform to wub 3 years ago
HeNine cc49374096 Change start time priority to make database take priority 3 years ago
HeNine 50c4d8a096 Final result sometimes has no content (if segments are missing after silence?) 3 years ago

@ -0,0 +1 @@
models/

@ -0,0 +1,15 @@
#!/bin/bash
# Build every Buscribe image and compile the LESS stylesheets to CSS.
# Must be run from the directory that contains buscribe/, buscribe-api/,
# professor-api/, docker-less/ and nginx/, because every build uses "." as
# its context and the Dockerfile paths are relative.

# Stop at the first failing step (and on unset variables / failed pipeline
# stages) so a broken build is not followed by further builds that silently
# package stale artefacts.
set -euo pipefail

# Tag applied to all application images.
VERSION=0.0.0
#bash fetch_models.sh
docker build -f buscribe/Dockerfile -t "buscribe:${VERSION}" .
docker build -f buscribe-api/Dockerfile -t "buscribe-api:${VERSION}" .
docker build -f professor-api/Dockerfile -t "professor-api:${VERSION}" .
# Helper image whose entrypoint is lessc; used below to compile the stylesheets.
docker build -f docker-less/Dockerfile -t lessc .
# lessc writes the compiled CSS to stdout, which is redirected into the source tree.
docker run --rm -v "$(pwd)/buscribe-web:/buscribe-web" lessc /buscribe-web/style.less > buscribe-web/style.css
docker run --rm -v "$(pwd)/professor:/professor" lessc /professor/style.less > professor/style.css
# Built last: the nginx image copies the front-end directories, including the CSS generated above.
docker build -f nginx/Dockerfile -t "buscribe-web:${VERSION}" .

@ -13,6 +13,7 @@ RUN pip install /tmp/common && rm -r /tmp/common
# Install actual application
RUN apk add postgresql-dev postgresql-libs
COPY buscribe-api /tmp/buscribe-api
RUN pip install /tmp/buscribe-api && rm -r /tmp/buscribe-api
RUN pip install /tmp/buscribe-api && cp -r /tmp/buscribe-api/templates /templates \
&& rm -r /tmp/buscribe-api
ENTRYPOINT ["python3", "-m", "buscribeapi"]
ENTRYPOINT ["python3", "-m", "buscribeapi", "--base-dir", "/mnt"]

@ -1,9 +1,8 @@
import json
from datetime import timedelta
import flask as flask
import common
from common import dateutil, database
import flask as flask
from common import dateutil, database, format_bustime, dt_to_bustime, bustime_to_dt, parse_bustime
from dateutil.parser import ParserError
from flask import request, jsonify, Response, render_template
@ -22,11 +21,6 @@ def create_seconds_timedelta(seconds):
return timedelta(seconds=seconds)
def round_bus_time(delta: timedelta):
"""Round bus time down to the second."""
return f'{delta.days * 24 + delta.seconds // 3600:02}:{(delta.seconds % 3600) // 60:02}:{delta.seconds % 60:02}'
@app.route('/buscribe/vtt')
def get_vtt():
"""Returns WebVTT subtitle file for the period between start_time and end_time.
@ -74,20 +68,32 @@ def get_json():
(https://www.postgresql.org/docs/13/functions-textsearch.html)"""
start_time_string = request.args.get('start_time')
bus_start_time_string = request.args.get('bus_start_time')
if start_time_string is not None:
try:
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
elif bus_start_time_string is not None:
try:
start_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_start_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
start_time = None
end_time_string = request.args.get('end_time')
bus_end_time_string = request.args.get('bus_end_time')
if end_time_string is not None:
try:
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
elif bus_end_time_string is not None:
try:
end_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_end_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
end_time = None
@ -103,35 +109,120 @@ def get_json():
return jsonify([{"id": row.id,
"start_time": row.start_time.isoformat(),
"start_bus_time": round_bus_time(row.start_time - app.bustime_start),
"start_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.start_time), "second"),
"end_time": row.end_time.isoformat(),
"end_bus_time": round_bus_time(row.start_time - app.bustime_start),
"end_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.end_time), "second"),
"verifier": row.verifier,
"speakers": row.names,
"text": row.highlighted_text if row.highlighted_text is not None else ""} for row in results])
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
query = "SELECT *" + \
query = f"""
WITH q AS (
SELECT convert_query(%(text_query)s)
),
time_window AS (
SELECT id
FROM buscribe_transcriptions
WHERE start_time >= %(start_time)s
AND end_time <= %(end_time)s
),
relevant_lines AS (
(
",ts_headline(transcription_line, convert_query(%(text_query)s), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text" if ts_query is not None else ",transcription_line AS highlighted_text") + \
" FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s "
if ts_query is not None:
query += "AND (coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector)) @@ " \
"convert_query(%(text_query)s) " \
"ORDER BY ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector), convert_query(%(text_query)s)) DESC, " \
"start_time "
else:
query += "ORDER BY start_time "
if limit is not None:
query += "LIMIT %(limit)s "
if offset is not None:
query += "OFFSET %(limit)s "
query += ";"
SELECT id
FROM buscribe_transcriptions
WHERE id IN (SELECT id FROM time_window)
{"AND to_tsvector('english', transcription_line) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector('english', verified_line) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
)
(
(SELECT id,
start_time,
end_time,
null AS verifier,
names,
transcription_line,
ts_rank_cd(coalesce(to_tsvector('english', transcription_line), ''::tsvector) ||
coalesce(to_tsvector(array_to_string(names, ' ')), ''::tsvector), (SELECT * FROM q)) AS rank,
ts_headline(transcription_line,
(SELECT * FROM q), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line) AS inferred_speakers ON id = inferred_speakers.line
WHERE id IN (SELECT id FROM relevant_lines)
)
UNION
(
SELECT buscribe_transcriptions.id AS id,
start_time,
end_time,
cverifier AS verifier,
names,
coalesce(verifications.verified_line,
buscribe_transcriptions.transcription_line) AS transcription_line,
ts_rank_cd(coalesce(
setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line),
''::tsvector) ||
coalesce(setweight(to_tsvector(array_to_string(names, ' ')), 'C'), ''::tsvector),
(SELECT * FROM q)) AS rank,
ts_headline(coalesce(verifications.verified_line, buscribe_transcriptions.transcription_line),
(SELECT * FROM q), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text,
null AS transcription_json
FROM buscribe_transcriptions
INNER JOIN (
SELECT *,
coalesce(relevant_verified.line, relevant_speakers.line) AS cline,
coalesce(relevant_verified.verifier, relevant_speakers.verifier) AS cverifier
FROM (SELECT *
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM relevant_lines)) AS relevant_verified
FULL OUTER JOIN
(SELECT line, verifier, array_agg(name) AS names
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM relevant_lines)
GROUP BY line, verifier) AS relevant_speakers
ON relevant_verified.line = relevant_speakers.line AND
relevant_speakers.verifier = relevant_verified.verifier) AS verifications
ON id = verifications.cline
)
)
ORDER BY
{"rank DESC," if ts_query is not None else ""}
start_time
{"OFFSET %(offset)s" if offset is not None else ""}
{"LIMIT %(limit)s" if limit is not None else ""};
"""
return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity',

@ -37,7 +37,8 @@ def servelet(server):
logging.info('Starting WSGI server.')
server.serve_forever()
@argh.arg('channel',
help="Twitch channel to transcribe.")
@argh.arg('--host',
help='Address or socket server will listen to. Default is 0.0.0.0 (everything on the local machine).')
@argh.arg('--port',
@ -49,7 +50,7 @@ def servelet(server):
help='The start time in UTC for the event, for UTC-Bustime conversion')
@argh.arg('--base-dir',
help='Directory from which segments will be grabbed. Default is current working directory.')
def main(database="", host='0.0.0.0', port=8010, bustime_start=None, base_dir=None):
def main(channel, database="", host='0.0.0.0', port=8010, bustime_start=None, base_dir=None):
if bustime_start is None:
logging.error("Missing --bustime-start!")
exit(1)
@ -62,7 +63,7 @@ def main(database="", host='0.0.0.0', port=8010, bustime_start=None, base_dir=No
logging.error("Invalid --bustime-start!")
exit(1)
app.segments_dir = base_dir
app.segments_dir = os.path.join(base_dir, channel, "source")
app.db_manager = DBManager(dsn=database)

@ -6,6 +6,12 @@
margin-bottom: 1em;
div {
margin: 0;
padding: 0;
display: flex;
}
label {
display: inline-block;
font-family: @sans-serif;
@ -14,7 +20,7 @@
padding: 0.2em;
}
#text_search_line{
#text_search_line {
display: flex;
flex-direction: row;
@ -29,10 +35,18 @@
#time_search_line {
display: flex;
flex-direction: row;
flex-wrap: wrap;
input[type=datetime-local] {
div {
align-items: center;
}
input[type=datetime-local], input[type=text] {
width: 13em;
}
input[type=text] {
text-align: right;
}
#search_button {

@ -14,9 +14,21 @@
placeholder="Supports quotes, 'or' and -.">
</div>
<div id="time_search_line" class="form_line">
<label for="start_time">Start time</label> <input id="start_time" type="datetime-local">
<label for="end_time">End time</label> <input id="end_time" type="datetime-local">
<div><label for="start_time">Start time</label> <input id="start_time" type="datetime-local" autocomplete="off"></div>
<div><label for="end_time">End time</label> <input id="end_time" type="datetime-local" autocomplete="off"></div>
<div>
<label for="channel_select">Channel</label><select id="channel_select">
<option value="desertbus" selected>desertbus</option>
<option value="loadingreadyrun">loadingreadyrun</option>
</select>
</div>
<div>
<label>Time type</label>
<input type="radio" name="time_type" id="UTC_time_radio" oninput="switchToUTC()" checked autocomplete="off"><label for="UTC_time_radio">UTC Time</label>
<input type="radio" name="time_type" id="bus_time_radio" oninput="switchToBus()" autocomplete="off"><label for="bus_time_radio">Bus Time</label>
</div>
<button id="search_button" onclick="doSearch()" type="button">Search</button>
</div>
</div>

@ -60,6 +60,20 @@
grid-column: text;
}
.line_links {
text-align: right;
grid-column: times;
a {
margin-left: 0.5em;
font-size: small;
font-family: @sans-serif;
color: lightgray;
text-align: right;
}
}
}
.line.verified {

@ -11,11 +11,13 @@ function onSiteLoad(e) {
function query(text, start_time, end_time) {
let query_string = ""
const time_type = document.getElementById("UTC_time_radio").checked ? "" : "bus_";
if (start_time !== "") {
query_string += `start_time=${start_time}`;
query_string += `${time_type}start_time=${start_time}`;
}
if (end_time !== "") {
query_string += `&end_time=${end_time}`;
query_string += `&${time_type}end_time=${end_time}`;
}
if (text !== "") {
query_string += `&query=${text}`
@ -23,9 +25,10 @@ function query(text, start_time, end_time) {
query_string += "&limit=30";
fetch(`http://localhost:8010/buscribe/json?${query_string}`)
const channel = document.getElementById("channel_select").value;
fetch(`https://wubloader.raptorpond.com/buscribe/${channel}/json?${query_string}`)
.then(response => response.json())
// .then(response => console.log(response.error()))
.then(fillResults)
}
@ -42,6 +45,8 @@ function fillResults(results) {
const results_element = document.getElementById("results")
results_element.innerHTML = ""
const channel = document.getElementById("channel_select").value;
for (const line of results) {
const line_div = document.createElement("div");
@ -56,9 +61,42 @@ function fillResults(results) {
<div class="line_speakers">${line.speakers == null ? "" : line.speakers.join(", ")}</div>
<div class="line_start_time">${line.start_time}</div>
<div class="line_text">${line.text}</div>
<div class="line_links">
<a href="/professor/professor.html?line=${line.id}">Edit</a>
<a href="javascript:showContext('${line.start_time}');">Show context</a>
</div>
`;
results_element.append(line_div)
}
}
function switchToUTC() {
    // UTC mode: both ends of the time range use the date-time picker input type.
    for (const input_id of ["start_time", "end_time"]) {
        document.getElementById(input_id).type = "datetime-local";
    }
}
function switchToBus() {
    // Bus-time mode: both ends of the time range become free-text inputs.
    for (const input_id of ["start_time", "end_time"]) {
        document.getElementById(input_id).type = "text";
    }
}
/**
 * Re-run the search for the three minutes either side of `time`, with no
 * text filter, so the lines surrounding a result are shown.
 *
 * @param {string} time - Timestamp of the line; a "Z" is appended before
 *     parsing so it is read as UTC (assumes the string carries no timezone
 *     designator of its own — TODO confirm against the API output).
 */
function showContext(time) {
    const CONTEXT_MINUTES = 3;

    // Build one edge of the window as an ISO string without the trailing "Z",
    // which is the form the date-time inputs are given here.
    const windowEdge = (offset_minutes) => {
        const edge = new Date(time + "Z");
        edge.setMinutes(edge.getMinutes() + offset_minutes);
        return edge.toISOString().slice(0, -1);
    };

    // The window is expressed in UTC. If the form were left in Bus Time mode
    // the search would send these values as bus_start_time/bus_end_time, so
    // switch the form to UTC mode before filling it in.
    document.getElementById("UTC_time_radio").checked = true;
    const start_input = document.getElementById("start_time");
    const end_input = document.getElementById("end_time");
    start_input.type = "datetime-local";
    end_input.type = "datetime-local";

    start_input.value = windowEdge(-CONTEXT_MINUTES);
    end_input.value = windowEdge(CONTEXT_MINUTES);
    document.getElementById("search_text").value = "";

    doSearch();
}

@ -1,9 +1,9 @@
FROM debian:latest
FROM debian:11
RUN apt update &&\
apt install -y python3 libpq-dev python3-pip curl unzip ffmpeg
COPY ../common /tmp/common
COPY common /tmp/common
RUN pip install /tmp/common && rm -r /tmp/common
COPY buscribe /tmp/buscribe

@ -84,7 +84,7 @@ def get_end_of_transcript(db_cursor):
"""Grab the end timestamp of the current transcript.
If there is no existing transcript returns default; used for cold starts."""
db_cursor.execute("SELECT end_time FROM buscribe.public.buscribe_transcriptions ORDER BY end_time DESC LIMIT 1")
db_cursor.execute("SELECT end_time FROM buscribe_transcriptions ORDER BY end_time DESC LIMIT 1")
end_of_transcript_row = db_cursor.fetchone()
return end_of_transcript_row.end_time if end_of_transcript_row is not None else None
@ -94,6 +94,7 @@ def finish_off_recognizer(recognizer: BuscribeRecognizer, db_cursor):
"""Flush the recognizer, commit the final line to the database and reset it."""
final_result_json = json.loads(recognizer.final_result()) # Flush the tubes
if "result" in final_result_json:
line_start_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][0]["start"])
line_end_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][-1]["end"])

@ -1,6 +1,6 @@
import logging
import os
from datetime import timedelta, datetime
from datetime import timedelta, datetime, timezone
from time import sleep
import argh
@ -27,13 +27,15 @@ from buscribe.recognizer import BuscribeRecognizer
help='Start time of the transcript. Buscript will try to start reading 2 min before this time, if available, '
'to prime the model. The transcripts for that time will not be written to the database. If not given '
'transcription will start after last already transcribed line.')
@argh.arg('--start-time-override',
help='Ignore database and force override the start time.')
@argh.arg('--end-time',
help='End of transcript. If not given continues to transcribe live.')
@argh.arg('--base-dir',
help='Directory from which segments will be grabbed. Default is current working directory.')
def main(channel, database="", base_dir=".",
model="/usr/share/buscribe/vosk-model-en-us-0.21/", spk_model="/usr/share/buscribe/vosk-model-spk-0.4/",
start_time=None, end_time=None):
start_time=None, end_time=None, start_time_override=None):
SAMPLE_RATE = 48000
segments_dir = os.path.join(base_dir, channel, "source")
@ -44,19 +46,27 @@ def main(channel, database="", base_dir=".",
logging.debug("Got database cursor.")
logging.info("Figuring out starting time...")
if start_time is not None:
db_start_time = get_end_of_transcript(db_cursor)
# ~~Database start time takes priority~~
# Override takes priority
if start_time_override is not None:
start_time = dateutil.parse(start_time_override)
elif db_start_time is not None:
start_time = db_start_time
elif start_time is not None:
start_time = dateutil.parse(start_time)
else:
start_time = get_end_of_transcript(db_cursor)
if end_time is not None:
end_time = dateutil.parse(end_time)
# No start time argument AND no end of transcript (empty database)
if start_time is None:
logging.error("Couldn't figure out start time!")
db_conn.close()
exit(1)
logging.info("Start time: {}".format(start_time))
if end_time is not None:
end_time = dateutil.parse(end_time)
logging.info("End time: {}".format(end_time))
logging.info("Loading models...")
recognizer = BuscribeRecognizer(SAMPLE_RATE, model, spk_model)
@ -77,15 +87,34 @@ def main(channel, database="", base_dir=".",
gevent.signal_handler(signal.SIGTERM, stop)
while True:
while start_time < end_time:
# If end time isn't given, use current time (plus fudge) to get a "live" segment list
segments = common.get_best_segments(segments_dir,
start_time,
end_time if end_time is not None else datetime.now() + timedelta(minutes=2))
# Remove initial None segment if it exists
end_time if end_time is not None else
datetime.utcnow() + timedelta(minutes=2))
# There is a hole at the start of the requested range (the first segment is None)
if segments[0] is None:
# The hole is older than a minute, therefore
# - reset recognizer
# - continue from existing segments
if datetime.utcnow() - start_time > timedelta(minutes=1):
finish_off_recognizer(recognizer, db_cursor)
# If the hole is less than a minute old, or if we don't have new segments: wait for segments
if datetime.utcnow() - start_time <= timedelta(minutes=1) or \
segments == [None]:
logging.info("Waiting for new or backfilled segments.")
sleep(30)
continue # Retry
# Remove initial None segment (indicating segments start time is after desired start time) if it exists
if segments[0] is None:
segments = segments[1:]
# Recognizer is fresh or was reset
if recognizer.segments_start_time is None:
recognizer.segments_start_time = segments[0].start
logging.info(f"Starting from: {segments[0].start}")
@ -99,14 +128,5 @@ def main(channel, database="", base_dir=".",
finish_off_recognizer(recognizer, db_cursor)
db_conn.close()
exit(0)
elif datetime.now() - segments_end_time > timedelta(minutes=5):
# Last seen segment ended more than five minutes ago. We hit a gap that will likely stay unfilled.
# Reset and jump to the other end of the gap.
finish_off_recognizer(recognizer, db_cursor)
else:
# End of live segment or a gap that is not old and might get filled.
# Give it a bit of time and continue.
# Note: if the gap is not filled within 30s, we jump to the next available segment.
sleep(30)
start_time = segments_end_time

@ -7,8 +7,10 @@ setup(
install_requires = [
"argh",
"psycopg2",
"gevent==1.5a2",
"greenlet==0.4.16",
#"gevent==1.5a2",
"gevent",
#"greenlet==0.4.16",
"greenlet",
"psycogreen",
"wubloader-common",
"python-dateutil",

@ -49,9 +49,9 @@ CREATE TABLE buscribe_verifiers
);
-- For testing
INSERT INTO buscribe_verifiers(email, name)
VALUES ('placeholder@example.com', 'Place Holder'),
('aguy@example.com', 'Arnold Guyana');
-- INSERT INTO buscribe_verifiers(email, name)
-- VALUES ('placeholder@example.com', 'Place Holder'),
-- ('aguy@example.com', 'Arnold Guyana');
CREATE TABLE buscribe_line_speakers
(
@ -62,6 +62,13 @@ CREATE TABLE buscribe_line_speakers
PRIMARY KEY (line, speaker, verifier)
);
-- Speaker attributions for a transcription line that carry no verifier
-- (unlike buscribe_line_speakers, whose key also includes the verifier).
-- At most one row per (line, speaker) pair.
-- NOTE(review): presumably filled in automatically rather than by a person; confirm.
CREATE TABLE buscribe_line_inferred_speakers
(
line BIGINT NOT NULL REFERENCES buscribe_transcriptions,
speaker BIGINT NOT NULL REFERENCES buscribe_speakers,
PRIMARY KEY (line, speaker)
);
CREATE TABLE buscribe_verified_lines
(
-- id BIGSERIAL PRIMARY KEY,
@ -85,8 +92,9 @@ SELECT buscribe_transcriptions.id,
end_time,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names,
verified_line AS transcription_line,
setweight(to_tsvector('english', verified_line), 'C') AS transcription_line_ts,
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
null AS transcription_json
FROM buscribe_transcriptions
@ -106,15 +114,66 @@ SELECT id,
start_time,
end_time,
null AS verifier,
null AS names,
names,
transcription_line,
to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts,
transcription_json
FROM buscribe_transcriptions;
FROM buscribe_transcriptions
LEFT OUTER JOIN (
SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line
) AS speakers ON id = speakers.line;
ROLLBACK;
CREATE VIEW buscribe_all_transcriptions2 AS
SELECT buscribe_transcriptions.id,
start_time,
end_time,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names,
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
null AS transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN buscribe_verified_lines ON buscribe_transcriptions.id = buscribe_verified_lines.line
LEFT OUTER JOIN (
SELECT line, verifier, array_agg(name) AS names
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
GROUP BY line, verifier
) AS speakers ON buscribe_transcriptions.id = speakers.line AND (
speakers.verifier = buscribe_verified_lines.verifier OR
buscribe_verified_lines.verifier IS NULL
)
WHERE coalesce(buscribe_verified_lines.verifier, speakers.verifier) IS NOT NULL
UNION
SELECT id,
start_time,
end_time,
null AS verifier,
names,
transcription_line,
to_tsvector('english', transcription_line) AS machine_line_ts,
null AS verified_line_ts,
to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (
SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line
) AS speakers ON id = speakers.line;
-- Convert last lexeme in a query to prefix query.
CREATE FUNCTION convert_query(query_text text) RETURNS tsquery AS
$$

@ -0,0 +1,130 @@
# Deployment stack: nginx front end, transcription workers, the buscribe and
# professor APIs, PostgreSQL and a Prometheus exporter for it.
# NOTE(review): the database user and password appear in plain text in every
# connection string and in the postgres environment below. Consider moving
# them to an env file or secret and rotating the committed password.
version: "3"
services:
# Front end image (built as buscribe-web by the build script); container port 80
# is published on host port 8020 and routed through Traefik via the labels.
buscribenginx:
image: buscribe-web:0.0.0
ports:
- "8020:80"
volumes:
- /srv/wubloader/segments:/usr/share/nginx/html/segments
networks:
- default
- wubloader_default
- traefik_network
labels:
- "traefik.docker.network=traefik_network"
- "traefik.http.routers.buscribe-router.rule=Host(`wubloader.raptorpond.com`)"
- "traefik.http.routers.buscribe-redirect.rule=Host(`wubloader.raptorpond.com`)"
- "traefik.http.routers.buscribe-redirect.entrypoints=web"
- "traefik.http.routers.buscribe-router.tls=true"
- "traefik.http.routers.buscribe-router.tls.certresolver=leresolver"
- "traefik.http.middlewares.buscribe-redirectscheme.redirectscheme.scheme=https"
- "traefik.http.middlewares.buscribe-redirectscheme.redirectscheme.permanent=true"
- "traefik.http.routers.buscribe-redirect.middlewares=buscribe-redirectscheme@docker"
restart: "on-failure"
# buscribelrr:
# image: buscribe:0.0.0
# command: [ "loadingreadyrun",
# "--start-time=2022-11-11T12:00:00Z",
# "--end-time=2022-11-20T22:00:00Z",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr",
# "--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
# volumes:
# - /srv/wubloader/segments:/mnt
# buscribedb:
# image: buscribe:0.0.0
# command: [ "desertbus",
# "--start-time=2023-11-10T12:00:00Z",
# "--end-time=2023-11-15T00:00:00Z",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
# "--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
# volumes:
# - /srv/wubloader/segments:/mnt
# buscribedb0..buscribedb3: four transcription workers for the desertbus
# channel, each given its own six-hour window via --start-time-override and
# --end-time, all writing to the buscribe_db database.
buscribedb0:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-19T00:00:00Z",
"--end-time=2023-11-19T06:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb1:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T06:00:00Z",
"--end-time=2023-11-18T12:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb2:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T12:00:00Z",
"--end-time=2023-11-18T18:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb3:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T18:00:00Z",
"--end-time=2023-11-19T00:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
# buscribeapilrr:
# image: buscribe-api:0.0.0
# command: [
# "loadingreadyrun",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr",
# "--bustime-start=2023-11-11T22:00:00Z" ]
# Search API for the desertbus channel; the segments directory is mounted at
# /mnt, which is the --base-dir set in the buscribe-api image entrypoint.
buscribeapidb:
image: buscribe-api:0.0.0
command: [
"desertbus",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--bustime-start=2023-11-11T22:00:00Z" ]
volumes:
- /srv/wubloader/segments:/mnt
# Professor API against the same buscribe_db database.
professorapidb:
image: professor-api:0.0.0
command: [
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--bustime-start=2023-11-11T22:00:00Z" ]
# Database server; container port 5432 is published on host port 7654 and the
# data directory is kept on the host under /srv/buscribe/postgres.
postgres:
image: postgres:13
ports:
- "7654:5432"
environment:
- POSTGRES_USER=vst
- POSTGRES_DB=postgres
- POSTGRES_PASSWORD=flnMSYPRf
volumes:
- /srv/buscribe/postgres:/var/lib/postgresql/data
restart: "unless-stopped"
# NOTE(review): the exporter connects to buscribe_lrr while every active
# service above uses buscribe_db; confirm this is the database to monitor.
postgres-prometheus:
image: quay.io/prometheuscommunity/postgres-exporter
ports:
- "9187:9187"
environment:
- DATA_SOURCE_NAME=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr?sslmode=disable
# Both networks are created outside this file and must exist before `up`.
networks:
wubloader_default:
external: true
traefik_network:
external: true

@ -0,0 +1,5 @@
# Image wrapping the LESS compiler: installs `less` globally with npm and uses
# `lessc` as the entrypoint, so container arguments are passed straight to it.
FROM node:17-alpine
RUN npm install less -g
ENTRYPOINT ["lessc"]

@ -0,0 +1,5 @@
# nginx image bundling the buscribe-web and professor front-end files together
# with the project's nginx.conf. The COPY sources are relative to the build
# context, so the context must contain buscribe-web/, professor/ and nginx/.
# NOTE(review): the base image tag is `latest`, so rebuilds are not reproducible.
FROM nginx:latest
COPY buscribe-web /usr/share/nginx/html/buscribe
COPY professor /usr/share/nginx/html/professor
# Replaces the image's default top-level nginx configuration.
COPY nginx/nginx.conf /etc/nginx/nginx.conf

@ -0,0 +1,57 @@
# nginx configuration for the front-end container: serves the static buscribe
# and professor files and proxies API paths to the API containers.
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log notice;
pid /var/run/nginx.pid;
events {
worker_connections 1024;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
access_log /var/log/nginx/access.log main;
sendfile on;
#tcp_nopush on;
keepalive_timeout 65;
gzip on;
gzip_comp_level 9;
# Issue relative redirects so they do not embed this server's own host/port.
absolute_redirect off;
server {
listen 80;
server_name localhost;
#access_log /var/log/nginx/host.access.log main;
# Anything not matched by a more specific location below is forwarded to the
# host named `nginx`.
# NOTE(review): presumably the wubloader nginx reachable over the shared
# external network; confirm.
location / { proxy_pass http://nginx; }
# /buscribelrr and /buscribe both serve the same static directory.
location /buscribelrr {
alias /usr/share/nginx/html/buscribe;
}
location /buscribe {
alias /usr/share/nginx/html/buscribe;
}
location /professor {
alias /usr/share/nginx/html/professor;
}
# Per-channel API routes: the channel path segment selects the backend
# container and is dropped from the proxied URL.
#location /buscribe/loadingreadyrun/json { proxy_pass http://buscribeapilrr:8010/buscribe/json; }
location /buscribe/desertbus/json { proxy_pass http://buscribeapidb:8010/buscribe/json; }
location /professor/desertbus { proxy_pass http://professorapidb:8011/professor; }
}
}

@ -13,10 +13,10 @@ from professor_api.professor_api import app
def cors(app):
"""WSGI middleware that sets CORS headers"""
HEADERS = [
("Access-Control-Allow-Credentials", "false"),
("Access-Control-Allow-Headers", "*"),
("Access-Control-Allow-Credentials", "true"),
("Access-Control-Allow-Headers", "content-type"),
("Access-Control-Allow-Methods", "GET,HEAD,POST,PUT"),
("Access-Control-Allow-Origin", "*"),
("Access-Control-Allow-Origin", "http://localhost:63342,https://wubloader.raptorpond.com"),
("Access-Control-Expose-Headers", "*"),
("Access-Control-Max-Age", "86400"),
]
@ -45,7 +45,7 @@ def servelet(server):
'postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE')
@argh.arg('--bustime-start',
help='The start time in UTC for the event, for UTC-Bustime conversion')
def main(database="", host='0.0.0.0', port=8005, bustime_start=None):
def main(database="", host='0.0.0.0', port=8011, bustime_start=None):
if bustime_start is None:
logging.error("Missing --bustime-start!")
exit(1)

@ -1,5 +1,7 @@
import re
import urllib.parse
from functools import wraps
from random import randrange
import flask
import gevent
@ -8,9 +10,51 @@ from flask import jsonify, request, copy_current_request_context
from gevent import sleep
from psycopg2.extras import execute_values
from google.oauth2 import id_token
from google.auth.transport import requests
app = flask.Flask('buscribe')
def authenticate(f):
    """Decorator that only calls the view for a known, Google-authenticated verifier.

    The Google ID token is read from the ``credentials`` cookie, verified, and
    the email it carries is looked up (case-insensitively) in
    ``buscribe_verifiers``. On success the wrapped view is called with an
    extra ``editor`` keyword argument holding the lower-cased email.

    Failure responses:
        401 -- the ``credentials`` cookie is missing or empty.
        403 -- the token is invalid, has the wrong issuer, or the email is
               not a registered verifier.

    Reference: https://developers.google.com/identity/sign-in/web/backend-auth
    https://developers.google.com/identity/gsi/web/guides/verify-google-id-token#using-a-google-api-client-library"""

    @wraps(f)
    def auth_wrapper(*args, **kwargs):
        # cookies.get() returns None for a missing cookie instead of raising,
        # so the absence has to be checked explicitly. The token is a
        # credential and is deliberately not printed or logged.
        user_token = request.cookies.get("credentials")
        if not user_token:
            return 'User token required', 401

        try:
            idinfo = id_token.verify_oauth2_token(user_token, requests.Request(),
                                                  "164084252563-kaks3no7muqb82suvbubg7r0o87aip7n.apps.googleusercontent.com")
            if idinfo['iss'] not in ['accounts.google.com', 'https://accounts.google.com']:
                raise ValueError('Wrong issuer.')
        except ValueError:
            return 'Invalid token. Access denied.', 403

        # check whether user is in the database
        email = idinfo['email'].lower()
        conn = app.db_manager.get_conn()
        results = database.query(conn, """
            SELECT email
            FROM buscribe_verifiers
            WHERE lower(email) = %s""", email)
        row = results.fetchone()
        if row is None:
            return 'Unknown user. Access denied.', 403

        return f(*args, editor=email, **kwargs)

    return auth_wrapper
@app.route('/professor/line/<int:line_id>', methods=["GET"])
def get_line(line_id):
db_conn = app.db_manager.get_conn()
@ -20,7 +64,27 @@ def get_line(line_id):
if line is None:
return "Line not found.", 404
else:
return {"start_time": line.start_time.isoformat(),
return {"id": line.id,
"start_time": line.start_time.isoformat(),
"end_time": line.end_time.isoformat(),
"line_data": line.transcription_json}
@app.route('/professor/line/random', methods=["GET"])
def get_random_line():
    """Return one transcription line picked at a random row offset.

    The row count of ``buscribe_transcriptions`` is read first and a random
    offset below that count selects the row.

    Returns a dict with the line's ``id``, ISO-formatted ``start_time`` and
    ``end_time``, and its ``line_data`` (the stored transcription JSON), or
    ``("Line not found.", 404)`` when the table is empty or the selected row
    is no longer there.
    """
    db_conn = app.db_manager.get_conn()

    n_lines = database.query(db_conn, "SELECT count(*) AS n_lines FROM buscribe_transcriptions;").fetchone().n_lines

    # randrange(0) raises ValueError, so an empty table must be answered
    # here rather than turning into a server error.
    if not n_lines:
        return "Line not found.", 404

    row = randrange(n_lines)

    line = database.query(db_conn, "SELECT * FROM buscribe_transcriptions OFFSET %(row)s LIMIT 1;", row=row).fetchone()

    # The row can vanish between the count and the select.
    if line is None:
        return "Line not found.", 404

    return {"id": line.id,
            "start_time": line.start_time.isoformat(),
            "end_time": line.end_time.isoformat(),
            "line_data": line.transcription_json}
@ -42,12 +106,13 @@ def get_playlist(line_id):
#EXT-X-TARGETDURATION:{duration.total_seconds()}
#EXT-X-PROGRAM-DATE-TIME:{start_time_iso}
#EXTINF:{duration.total_seconds()}
//localhost/cut/desertbus/source.ts?start={urllib.parse.quote_plus(start_time_iso)}&end={urllib.parse.quote_plus(end_time_iso)}&type=rough&allow_holes=true
/cut/desertbus/source.ts?start={urllib.parse.quote_plus(start_time_iso)}&end={urllib.parse.quote_plus(end_time_iso)}&type=rough&allow_holes=true
#EXT-X-ENDLIST"""
@app.route('/professor/line/<int:line_id>', methods=["POST"])
def update_line(line_id):
@authenticate
def update_line(line_id, editor):
db_conn = app.db_manager.get_conn()
if "speakers" in request.json and \
@ -56,11 +121,11 @@ def update_line(line_id):
# Simpler than dealing with uniqueness
database.query(db_conn,
"DELETE FROM buscribe_line_speakers WHERE line = %(line_id)s AND verifier = %(verifier)s;",
line_id=line_id, verifier="placeholder@example.com")
line_id=line_id, verifier=editor)
execute_values(db_conn.cursor(),
"INSERT INTO buscribe_line_speakers(line, speaker, verifier) "
"VALUES %s;",
[(line_id, speaker, "placeholder@example.com") for speaker in
[(line_id, speaker, editor) for speaker in
request.json["speakers"]])
if "transcription" in request.json and \
isinstance(request.json["transcription"], str) and \
@ -70,11 +135,11 @@ def update_line(line_id):
database.query(db_conn,
"DELETE FROM buscribe_verified_lines WHERE line = %(line_id)s AND verifier = %(verifier)s;",
line_id=line_id, verifier="placeholder@example.com")
line_id=line_id, verifier=editor)
database.query(db_conn,
"INSERT INTO buscribe_verified_lines(line, verified_line, verifier) "
"VALUES (%(line)s, %(verified_line)s, %(verifier)s)",
line=line_id, verified_line=verified_line, verifier="placeholder@example.com")
line=line_id, verified_line=verified_line, verifier=editor)
return "", 204
@ -101,7 +166,8 @@ def get_speaker(speaker_id):
@app.route('/professor/speaker', methods=["PUT"])
def new_speaker():
@authenticate
def new_speaker(editor=None):
name = request.json
if not isinstance(name, str):

@ -11,6 +11,7 @@ setup(
"psycogreen",
"wubloader-common",
"python-dateutil",
"flask"
"flask",
"google-auth"
],
)

File diff suppressed because one or more lines are too long

@ -5,7 +5,6 @@
<title>Buscribe -- Professor</title>
<link href="video.js/dist/video-js.min.css" rel="stylesheet">
<!-- <link href="videojs-hls-quality-selector/dist/videojs-hls-quality-selector.css" rel="stylesheet">-->
<link href="jquery-ui-1.13.0.custom/jquery-ui.css" rel="stylesheet">
<link href="style.css" rel="stylesheet">
@ -13,11 +12,9 @@
<script src="jquery-ui-1.13.0.custom/external/jquery/jquery.js"></script>
<script src="jquery-ui-1.13.0.custom/jquery-ui.js"></script>
<script src="script.js"></script>
<script src="hotkeys.min.js"></script>
<!-- <script src="videojs-contrib-quality-levels/dist/videojs-contrib-quality-levels.min.js"></script>-->
<!-- <script src="videojs-hls-quality-selector/dist/videojs-hls-quality-selector.min.js"></script>-->
<script src="script.js"></script>
</head>
<body onload="pageReady()">
@ -46,7 +43,13 @@
<button id="submit_button" onclick="submit()" type="button">Submit</button><span id="update_indicator"></span>
<script src="video.js/dist/video.min.js"></script>
<div id="googleLoginButton" style="display: none"></div>
<div id="logout" style="display: none"><a href="javascript:doLogout()">Log out</a></div>
<script src="video.js/dist/video.min.js"></script>
<script src="https://accounts.google.com/gsi/client" async defer></script>
<script>
window.onGoogleLibraryLoad = doGoogle
</script>
</body>
</html>

@ -1,7 +1,13 @@
function pageReady() {
const params = new URLSearchParams(document.location.search.substring(1));
let line_id;
if (params.get("line") !== "random") {
line_id = parseInt(params.get("line"), 10);
} else {
line_id = "random"
}
videojs("player", {
// src: "test.m3u8",
@ -27,15 +33,61 @@ function pageReady() {
const bgOpacitySelector = document.querySelector('.vjs-bg-opacity > select');
bgOpacitySelector.value = "0.5"
fetch(`//localhost:8005/professor/line/${line_id}`)
fetch(`/professor/desertbus/line/${line_id}`)
.then(response => response.json())
.then(fillLineInfo)
.then(initializePlayer);
handleLoginState();
}
// Ctrl+Enter triggers the same action as clicking the Submit button.
hotkeys('ctrl+enter', () => {
    const submitButton = document.getElementById("submit_button");
    submitButton.click();
});
/**
 * Reveal the logout link when a `credentials` cookie is present,
 * otherwise reveal the Google sign-in button.
 */
function handleLoginState() {
    const hasCredentials = document.cookie
        .split('; ')
        .some(cookie => cookie.startsWith('credentials='));

    const elementToShow = hasCredentials ? "logout" : "googleLoginButton";
    document.getElementById(elementToShow).style.display = "";
}
/**
 * Set up Google sign-in: initialize the client with `loggedIn` as the
 * credential callback, render the sign-in button into #googleLoginButton,
 * and show the One Tap prompt.
 */
function doGoogle() {
    const identity = google.accounts.id;
    const buttonContainer = document.getElementById("googleLoginButton");
    const buttonOptions = {theme: "outline", size: "large"}; // customization attributes

    identity.initialize({
        client_id: "164084252563-kaks3no7muqb82suvbubg7r0o87aip7n.apps.googleusercontent.com",
        callback: loggedIn,
        auto_select: true
    });
    identity.renderButton(buttonContainer, buttonOptions);
    identity.prompt(); // also display the One Tap dialog
}
/**
 * Log the user out: expire the `credentials` cookie, stop Google from
 * automatically signing the user back in, and swap the logout link for
 * the sign-in button.
 */
function doLogout() {
    document.cookie = `credentials=;expires=Thu, 01 Jan 1970 00:00:01 GMT`;

    // Sign-in is initialized with auto_select, so without this call the
    // user would be signed straight back in after logging out.
    // Guarded because the Google library is loaded asynchronously.
    if (typeof google !== "undefined" && google.accounts && google.accounts.id) {
        google.accounts.id.disableAutoSelect();
    }

    document.getElementById("googleLoginButton").style.display = "";
    document.getElementById("logout").style.display = "none";
}
/**
 * Google sign-in callback: store the returned ID token in the
 * `credentials` cookie and swap the sign-in button for the logout link.
 *
 * The response is intentionally not logged, because it contains the
 * user's bearer credential.
 *
 * @param {{credential: string}} response - credential response from Google sign-in
 */
function loggedIn(response) {
    document.cookie = `credentials=${response.credential}`;

    document.getElementById("googleLoginButton").style.display = "none";
    document.getElementById("logout").style.display = "";
}
function fillLineInfo(line_json) {
// document.getElementById("original_transcription").innerText = line_json.line_data.text;
line_id = line_json.id
line = line_json
document.getElementById("original_transcription").innerHTML = line_json.line_data.result
.map(word => `<span style="opacity: ${word.conf}">${word.word}</span>`).join(" ");
@ -45,11 +97,12 @@ function fillLineInfo(line_json) {
function initializePlayer() {
videojs.getPlayer("player").src([
{src: `//localhost:8005/professor/line/${line_id}/playlist.m3u8`}
//{src: `/professor/desertbus/line/${line_id}/playlist.m3u8`}
{src: `/playlist/desertbus/source.m3u8?start=${line.start_time}&end=${line.end_time}`}
]);
videojs.getPlayer("player").addRemoteTextTrack({
kind: "captions",
src: `//localhost:8010/buscribe/vtt?start_time=${line.start_time}&end_time=${line.end_time}`,
src: `/buscribe/desertbus/vtt?start_time=${line.start_time}&end_time=${line.end_time}`,
srclang: "en",
label: "English",
default: true
@ -73,26 +126,28 @@ async function submit() {
}
}
return await fetch("//localhost:8005/professor/speaker",
return await fetch("/professor/desertbus/speaker",
{
method: "PUT",
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(speaker)
body: JSON.stringify(speaker),
credentials: "include"
}).then(response =>
parseInt(response.headers.get("Content-Location")
.split("/")
.pop(), 10));
}));
fetch(`//localhost:8005/professor/line/${line_id}`,
fetch(`/professor/desertbus/line/${line_id}`,
{
method: "POST",
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({transcription: new_transcription, speakers: new_speakers})
body: JSON.stringify({transcription: new_transcription, speakers: new_speakers}),
credentials: "include"
}).then(response => {
if (response.ok) {
document.getElementById("update_indicator").innerText = "\u2714\ufe0f"
@ -103,7 +158,7 @@ async function submit() {
}
$(function () {
fetch("//localhost:8005/professor/speaker")
fetch("/professor/desertbus/speaker")
.then(response => response.json())
.then(function (speakers_json) {
speakers = speakers_json;
@ -153,3 +208,16 @@ $(function () {
)
});
/**
 * Decode the payload (second dot-separated segment) of a JWT and return
 * it as a parsed object. The signature is NOT verified.
 *
 * @param {string} token - JWT in `header.payload.signature` form
 * @returns {*} the JSON-parsed payload
 */
function parseJwt(token) {
    const [, payloadSegment] = token.split('.');

    // Map the URL-safe base64 alphabet back to the standard one for atob().
    const standardBase64 = payloadSegment.replace(/-/g, '+').replace(/_/g, '/');

    // atob() yields one character per byte; percent-encode each byte so
    // decodeURIComponent() can reassemble multi-byte UTF-8 sequences.
    const percentEncoded = Array.from(
        atob(standardBase64),
        char => '%' + char.charCodeAt(0).toString(16).padStart(2, '0')
    ).join('');

    return JSON.parse(decodeURIComponent(percentEncoded));
}

@ -61,3 +61,10 @@ button {
span.verified_cc {
color: #c1ffc1;
}
/* Container for the "Log out" link. */
#logout {
    padding: 0.1em;
}

/* Flattened from a nested rule: browsers without CSS nesting support drop
   a nested type selector, so the link colour is set with a plain
   descendant selector instead. */
#logout a {
    color: darkgray;
}

@ -0,0 +1,12 @@
#!/bin/bash
# Run the buscribe:0.0.0 image once for the loadingreadyrun channel over a
# fixed time window, with the wubloader segments directory mounted at /mnt/.
#
# The database URL can be supplied through the BUSCRIBE_DATABASE environment
# variable; if it is unset, the previous hard-coded value is used.
# NOTE(review): the fallback below embeds a database password in version
# control. Rotate that credential and drop the fallback once callers set
# BUSCRIBE_DATABASE.
DATABASE_URL="${BUSCRIBE_DATABASE:-postgresql://vst:flnMSYPRf@mula.lan:6543/buscribe_lrr}"

docker run \
    --rm \
    -v /srv/wubloader/segments/:/mnt/ \
    buscribe:0.0.0 \
    loadingreadyrun \
    --start-time='2021-11-05T00:00' \
    --end-time='2021-11-07T00:00' \
    --database="$DATABASE_URL" \
    --model=/usr/share/buscribe/vosk-model-en-us-0.22/
# Alternative, smaller model:
# --model=/usr/share/buscribe/vosk-model-small-en-us-0.15/
Loading…
Cancel
Save