Compare commits

..

No commits in common. 'trunk' and 'multichannel' have entirely different histories.

@ -1 +0,0 @@
models/

@ -1,15 +0,0 @@
#!/bin/bash
# Build all Buscribe Docker images and compile the LESS stylesheets to CSS.
#
# Run from the repository root: every Dockerfile path and the build context
# (".") are relative to the current directory.

# Stop at the first failing command (and on unset variables) so that a broken
# image build or LESS compile is not silently followed by later steps that
# would package stale or empty output.
set -eu

VERSION=0.0.0

# Model download step is currently disabled.
#bash fetch_models.sh

docker build -f buscribe/Dockerfile -t "buscribe:$VERSION" .
docker build -f buscribe-api/Dockerfile -t "buscribe-api:$VERSION" .
docker build -f professor-api/Dockerfile -t "professor-api:$VERSION" .

# The lessc image is only a build tool, so it is left untagged with a version.
docker build -f docker-less/Dockerfile -t lessc .

# lessc writes the compiled CSS to stdout; redirect it next to its source.
docker run --rm -v "$(pwd)/buscribe-web:/buscribe-web" lessc /buscribe-web/style.less > buscribe-web/style.css
docker run --rm -v "$(pwd)/professor:/professor" lessc /professor/style.less > professor/style.css

# Built last: the nginx Dockerfile is built after the stylesheets are generated.
docker build -f nginx/Dockerfile -t "buscribe-web:$VERSION" .

@ -13,7 +13,6 @@ RUN pip install /tmp/common && rm -r /tmp/common
# Install actual application
RUN apk add postgresql-dev postgresql-libs
COPY buscribe-api /tmp/buscribe-api
RUN pip install /tmp/buscribe-api && cp -r /tmp/buscribe-api/templates /templates \
&& rm -r /tmp/buscribe-api
RUN pip install /tmp/buscribe-api && rm -r /tmp/buscribe-api
ENTRYPOINT ["python3", "-m", "buscribeapi", "--base-dir", "/mnt"]
ENTRYPOINT ["python3", "-m", "buscribeapi"]

@ -1,8 +1,9 @@
import json
from datetime import timedelta
import common
import flask as flask
from common import dateutil, database, format_bustime, dt_to_bustime, bustime_to_dt, parse_bustime
import common
from common import dateutil, database
from dateutil.parser import ParserError
from flask import request, jsonify, Response, render_template
@ -21,6 +22,11 @@ def create_seconds_timedelta(seconds):
return timedelta(seconds=seconds)
def round_bus_time(delta: timedelta):
    """Format an offset from the bus-time origin as ``HH:MM:SS``.

    The offset is rounded down (toward negative infinity) to a whole second.

    Args:
        delta: Offset to format. May be negative.

    Returns:
        The formatted offset. Each field is zero-padded to at least two
        digits; hours are not wrapped at 24, so offsets of a day or more
        yield e.g. ``"24:00:05"``. Negative offsets carry a leading ``-``
        in front of the magnitude, e.g. ``"-01:30:00"``.
    """
    # timedelta normalises itself so that only ``days`` can be negative and
    # ``microseconds`` is never negative; dropping the microseconds therefore
    # gives the floor of the offset in whole seconds.
    whole_seconds = delta.days * 86400 + delta.seconds

    # Split the magnitude, not the normalised fields: for negative deltas the
    # fields look like days=-1, seconds=81000 and would format incorrectly.
    sign = "-" if whole_seconds < 0 else ""
    hours, remainder = divmod(abs(whole_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{sign}{hours:02}:{minutes:02}:{seconds:02}"
@app.route('/buscribe/vtt')
def get_vtt():
"""Returns WebVTT subtitle file for the period between start_time and end_time.
@ -68,32 +74,20 @@ def get_json():
(https://www.postgresql.org/docs/13/functions-textsearch.html)"""
start_time_string = request.args.get('start_time')
bus_start_time_string = request.args.get('bus_start_time')
if start_time_string is not None:
try:
start_time = dateutil.parse(start_time_string)
except ParserError:
return "Invalid start time!", 400
elif bus_start_time_string is not None:
try:
start_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_start_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
start_time = None
end_time_string = request.args.get('end_time')
bus_end_time_string = request.args.get('bus_end_time')
if end_time_string is not None:
try:
end_time = dateutil.parse(end_time_string)
except ParserError:
return "Invalid end time!", 400
elif bus_end_time_string is not None:
try:
end_time = bustime_to_dt(app.bustime_start, parse_bustime(bus_end_time_string))
except ValueError:
return "Invalid bus end time!", 400
else:
end_time = None
@ -109,120 +103,35 @@ def get_json():
return jsonify([{"id": row.id,
"start_time": row.start_time.isoformat(),
"start_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.start_time), "second"),
"start_bus_time": round_bus_time(row.start_time - app.bustime_start),
"end_time": row.end_time.isoformat(),
"end_bus_time": format_bustime(dt_to_bustime(app.bustime_start, row.end_time), "second"),
"end_bus_time": round_bus_time(row.start_time - app.bustime_start),
"verifier": row.verifier,
"speakers": row.names,
"text": row.highlighted_text if row.highlighted_text is not None else ""} for row in results])
def fetch_lines(db_conn, start_time, end_time, ts_query=None, limit=None, offset=None):
query = f"""
WITH q AS (
SELECT convert_query(%(text_query)s)
),
time_window AS (
SELECT id
FROM buscribe_transcriptions
WHERE start_time >= %(start_time)s
AND end_time <= %(end_time)s
),
relevant_lines AS (
(
SELECT id
FROM buscribe_transcriptions
WHERE id IN (SELECT id FROM time_window)
{"AND to_tsvector('english', transcription_line) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector('english', verified_line) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
UNION
(
SELECT line
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM time_window)
{"AND to_tsvector(name) @@ (SELECT * FROM q)" if ts_query else ""}
)
)
(
(SELECT id,
start_time,
end_time,
null AS verifier,
names,
transcription_line,
ts_rank_cd(coalesce(to_tsvector('english', transcription_line), ''::tsvector) ||
coalesce(to_tsvector(array_to_string(names, ' ')), ''::tsvector), (SELECT * FROM q)) AS rank,
ts_headline(transcription_line,
(SELECT * FROM q), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line) AS inferred_speakers ON id = inferred_speakers.line
WHERE id IN (SELECT id FROM relevant_lines)
)
UNION
(
SELECT buscribe_transcriptions.id AS id,
start_time,
end_time,
cverifier AS verifier,
names,
coalesce(verifications.verified_line,
buscribe_transcriptions.transcription_line) AS transcription_line,
ts_rank_cd(coalesce(
setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line),
''::tsvector) ||
coalesce(setweight(to_tsvector(array_to_string(names, ' ')), 'C'), ''::tsvector),
(SELECT * FROM q)) AS rank,
ts_headline(coalesce(verifications.verified_line, buscribe_transcriptions.transcription_line),
(SELECT * FROM q), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text,
null AS transcription_json
FROM buscribe_transcriptions
INNER JOIN (
SELECT *,
coalesce(relevant_verified.line, relevant_speakers.line) AS cline,
coalesce(relevant_verified.verifier, relevant_speakers.verifier) AS cverifier
FROM (SELECT *
FROM buscribe_verified_lines
WHERE line IN (SELECT id FROM relevant_lines)) AS relevant_verified
FULL OUTER JOIN
(SELECT line, verifier, array_agg(name) AS names
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers
ON buscribe_line_speakers.speaker = buscribe_speakers.id
WHERE line IN (SELECT id FROM relevant_lines)
GROUP BY line, verifier) AS relevant_speakers
ON relevant_verified.line = relevant_speakers.line AND
relevant_speakers.verifier = relevant_verified.verifier) AS verifications
ON id = verifications.cline
)
)
ORDER BY
{"rank DESC," if ts_query is not None else ""}
start_time
{"OFFSET %(offset)s" if offset is not None else ""}
{"LIMIT %(limit)s" if limit is not None else ""};
"""
query = "SELECT *" + \
(
",ts_headline(transcription_line, convert_query(%(text_query)s), 'StartSel=''<span class=\"highlight\">'', StopSel=</span>') AS highlighted_text" if ts_query is not None else ",transcription_line AS highlighted_text") + \
" FROM buscribe_all_transcriptions WHERE start_time >= %(start_time)s AND end_time <= %(end_time)s "
if ts_query is not None:
query += "AND (coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector)) @@ " \
"convert_query(%(text_query)s) " \
"ORDER BY ts_rank_cd(coalesce(transcription_line_ts, ''::tsvector) || coalesce(names_ts, ''::tsvector), convert_query(%(text_query)s)) DESC, " \
"start_time "
else:
query += "ORDER BY start_time "
if limit is not None:
query += "LIMIT %(limit)s "
if offset is not None:
query += "OFFSET %(limit)s "
query += ";"
return database.query(db_conn, query,
start_time=start_time if start_time is not None else '-infinity',

@ -37,8 +37,7 @@ def servelet(server):
logging.info('Starting WSGI server.')
server.serve_forever()
@argh.arg('channel',
help="Twitch channel to transcribe.")
@argh.arg('--host',
help='Address or socket server will listen to. Default is 0.0.0.0 (everything on the local machine).')
@argh.arg('--port',
@ -50,7 +49,7 @@ def servelet(server):
help='The start time in UTC for the event, for UTC-Bustime conversion')
@argh.arg('--base-dir',
help='Directory from which segments will be grabbed. Default is current working directory.')
def main(channel, database="", host='0.0.0.0', port=8010, bustime_start=None, base_dir=None):
def main(database="", host='0.0.0.0', port=8010, bustime_start=None, base_dir=None):
if bustime_start is None:
logging.error("Missing --bustime-start!")
exit(1)
@ -63,7 +62,7 @@ def main(channel, database="", host='0.0.0.0', port=8010, bustime_start=None, ba
logging.error("Invalid --bustime-start!")
exit(1)
app.segments_dir = os.path.join(base_dir, channel, "source")
app.segments_dir = base_dir
app.db_manager = DBManager(dsn=database)

@ -6,12 +6,6 @@
margin-bottom: 1em;
div {
margin: 0;
padding: 0;
display: flex;
}
label {
display: inline-block;
font-family: @sans-serif;
@ -20,7 +14,7 @@
padding: 0.2em;
}
#text_search_line {
#text_search_line{
display: flex;
flex-direction: row;
@ -35,18 +29,10 @@
#time_search_line {
display: flex;
flex-direction: row;
flex-wrap: wrap;
div {
align-items: center;
}
input[type=datetime-local], input[type=text] {
input[type=datetime-local] {
width: 13em;
}
input[type=text] {
text-align: right;
}
#search_button {

@ -9,27 +9,15 @@
<body onload="onSiteLoad()">
<div id="search_tools">
<div id="text_search_line" class="form_line">
<label for="search_text">Search</label> <input type="search" id="search_text" oninput="doSearch()"
placeholder="Supports quotes, 'or' and -.">
</div>
<div id="time_search_line" class="form_line">
<div><label for="start_time">Start time</label> <input id="start_time" type="datetime-local" autocomplete="off"></div>
<div><label for="end_time">End time</label> <input id="end_time" type="datetime-local" autocomplete="off"></div>
<div>
<label for="channel_select">Channel</label><select id="channel_select">
<option value="desertbus" selected>desertbus</option>
<option value="loadingreadyrun">loadingreadyrun</option>
</select>
<div id="text_search_line" class="form_line">
<label for="search_text">Search</label> <input type="search" id="search_text" oninput="doSearch()"
placeholder="Supports quotes, 'or' and -.">
</div>
<div>
<label>Time type</label>
<input type="radio" name="time_type" id="UTC_time_radio" oninput="switchToUTC()" checked autocomplete="off"><label for="UTC_time_radio">UTC Time</label>
<input type="radio" name="time_type" id="bus_time_radio" oninput="switchToBus()" autocomplete="off"><label for="bus_time_radio">Bus Time</label>
<div id="time_search_line" class="form_line">
<label for="start_time">Start time</label> <input id="start_time" type="datetime-local">
<label for="end_time">End time</label> <input id="end_time" type="datetime-local">
<button id="search_button" onclick="doSearch()" type="button">Search</button>
</div>
<button id="search_button" onclick="doSearch()" type="button">Search</button>
</div>
</div>
<div id="results">

@ -60,20 +60,6 @@
grid-column: text;
}
.line_links {
text-align: right;
grid-column: times;
a {
margin-left: 0.5em;
font-size: small;
font-family: @sans-serif;
color: lightgray;
text-align: right;
}
}
}
.line.verified {

@ -11,13 +11,11 @@ function onSiteLoad(e) {
function query(text, start_time, end_time) {
let query_string = ""
const time_type = document.getElementById("UTC_time_radio").checked ? "" : "bus_";
if (start_time !== "") {
query_string += `${time_type}start_time=${start_time}`;
query_string += `start_time=${start_time}`;
}
if (end_time !== "") {
query_string += `&${time_type}end_time=${end_time}`;
query_string += `&end_time=${end_time}`;
}
if (text !== "") {
query_string += `&query=${text}`
@ -25,10 +23,9 @@ function query(text, start_time, end_time) {
query_string += "&limit=30";
const channel = document.getElementById("channel_select").value;
fetch(`https://wubloader.raptorpond.com/buscribe/${channel}/json?${query_string}`)
fetch(`http://localhost:8010/buscribe/json?${query_string}`)
.then(response => response.json())
// .then(response => console.log(response.error()))
.then(fillResults)
}
@ -45,8 +42,6 @@ function fillResults(results) {
const results_element = document.getElementById("results")
results_element.innerHTML = ""
const channel = document.getElementById("channel_select").value;
for (const line of results) {
const line_div = document.createElement("div");
@ -61,42 +56,9 @@ function fillResults(results) {
<div class="line_speakers">${line.speakers == null ? "" : line.speakers.join(", ")}</div>
<div class="line_start_time">${line.start_time}</div>
<div class="line_text">${line.text}</div>
<div class="line_links">
<a href="/professor/professor.html?line=${line.id}">Edit</a>
<a href="javascript:showContext('${line.start_time}');">Show context</a>
</div>
`;
results_element.append(line_div)
}
}
function switchToUTC() {
    // Turn both time fields into date-time picker inputs.
    for (const input_id of ["start_time", "end_time"]) {
        document.getElementById(input_id).type = "datetime-local";
    }
}
function switchToBus() {
    // Turn both time fields into plain text inputs.
    for (const input_id of ["start_time", "end_time"]) {
        document.getElementById(input_id).type = "text";
    }
}
function showContext(time) {
    // Search the six-minute window centred on `time` (three minutes either
    // side) with the text query cleared.
    // NOTE(review): "Z" is appended before parsing, so `time` is presumably a
    // timestamp string without a zone designator - confirm against the caller.
    const shiftedTimestamp = (offset_minutes) => {
        const moment = new Date(time + "Z");
        moment.setMinutes(moment.getMinutes() + offset_minutes);
        // Drop the trailing "Z" that toISOString() always emits.
        return moment.toISOString().slice(0, -1);
    };

    // Compute both bounds before touching the form.
    const window_start = shiftedTimestamp(-3);
    const window_end = shiftedTimestamp(3);

    document.getElementById("start_time").value = window_start;
    document.getElementById("end_time").value = window_end;
    document.getElementById("search_text").value = "";

    doSearch();
}

@ -1,9 +1,9 @@
FROM debian:11
FROM debian:latest
RUN apt update &&\
apt install -y python3 libpq-dev python3-pip curl unzip ffmpeg
COPY common /tmp/common
COPY ../common /tmp/common
RUN pip install /tmp/common && rm -r /tmp/common
COPY buscribe /tmp/buscribe

@ -84,7 +84,7 @@ def get_end_of_transcript(db_cursor):
"""Grab the end timestamp of the current transcript.
If there is no existing transcript returns default; used for cold starts."""
db_cursor.execute("SELECT end_time FROM buscribe_transcriptions ORDER BY end_time DESC LIMIT 1")
db_cursor.execute("SELECT end_time FROM buscribe.public.buscribe_transcriptions ORDER BY end_time DESC LIMIT 1")
end_of_transcript_row = db_cursor.fetchone()
return end_of_transcript_row.end_time if end_of_transcript_row is not None else None
@ -94,10 +94,9 @@ def finish_off_recognizer(recognizer: BuscribeRecognizer, db_cursor):
"""Flush the recognizer, commit the final line to the database and reset it."""
final_result_json = json.loads(recognizer.final_result()) # Flush the tubes
if "result" in final_result_json:
line_start_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][0]["start"])
line_end_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][-1]["end"])
line_start_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][0]["start"])
line_end_time = recognizer.segments_start_time + timedelta(seconds=final_result_json["result"][-1]["end"])
write_line(final_result_json, line_start_time, line_end_time, db_cursor)
write_line(final_result_json, line_start_time, line_end_time, db_cursor)
recognizer.reset()

@ -1,6 +1,6 @@
import logging
import os
from datetime import timedelta, datetime, timezone
from datetime import timedelta, datetime
from time import sleep
import argh
@ -27,15 +27,13 @@ from buscribe.recognizer import BuscribeRecognizer
help='Start time of the transcript. Buscript will try to start reading 2 min before this time, if available, '
'to prime the model. The transcripts for that time will not be written to the database. If not given '
'transcription will start after last already transcribed line.')
@argh.arg('--start-time-override',
help='Ignore database and force override the start time.')
@argh.arg('--end-time',
help='End of transcript. If not given continues to transcribe live.')
@argh.arg('--base-dir',
help='Directory from which segments will be grabbed. Default is current working directory.')
def main(channel, database="", base_dir=".",
model="/usr/share/buscribe/vosk-model-en-us-0.21/", spk_model="/usr/share/buscribe/vosk-model-spk-0.4/",
start_time=None, end_time=None, start_time_override=None):
start_time=None, end_time=None):
SAMPLE_RATE = 48000
segments_dir = os.path.join(base_dir, channel, "source")
@ -46,27 +44,19 @@ def main(channel, database="", base_dir=".",
logging.debug("Got database cursor.")
logging.info("Figuring out starting time...")
db_start_time = get_end_of_transcript(db_cursor)
# ~~Database start time takes priority~~
# Override takes priority
if start_time_override is not None:
start_time = dateutil.parse(start_time_override)
elif db_start_time is not None:
start_time = db_start_time
elif start_time is not None:
if start_time is not None:
start_time = dateutil.parse(start_time)
else:
# No start time argument AND no end of transcript (empty database)
logging.error("Couldn't figure out start time!")
db_conn.close()
exit(1)
logging.info("Start time: {}".format(start_time))
start_time = get_end_of_transcript(db_cursor)
if end_time is not None:
end_time = dateutil.parse(end_time)
logging.info("End time: {}".format(end_time))
# No start time argument AND no end of transcript (empty database)
if start_time is None:
logging.error("Couldn't figure out start time!")
db_conn.close()
exit(1)
logging.info("Loading models...")
recognizer = BuscribeRecognizer(SAMPLE_RATE, model, spk_model)
@ -87,34 +77,15 @@ def main(channel, database="", base_dir=".",
gevent.signal_handler(signal.SIGTERM, stop)
while start_time < end_time:
while True:
# If end time isn't given, use current time (plus fudge) to get a "live" segment list
segments = common.get_best_segments(segments_dir,
start_time,
end_time if end_time is not None else
datetime.utcnow() + timedelta(minutes=2))
# There is a hole at the start of the requested range (the first entry is None)
if segments[0] is None:
# The hole is older than a minute, therefore
# - reset recognizer
# - continue from existing segments
if datetime.utcnow() - start_time > timedelta(minutes=1):
finish_off_recognizer(recognizer, db_cursor)
# If the hole is less than a minute old, or if we don't have new segments: wait for segments
if datetime.utcnow() - start_time <= timedelta(minutes=1) or \
segments == [None]:
logging.info("Waiting for new or backfilled segments.")
sleep(30)
continue # Retry
# Remove initial None segment (indicating segments start time is after desired start time) if it exists
end_time if end_time is not None else datetime.now() + timedelta(minutes=2))
# Remove initial None segment if it exists
if segments[0] is None:
segments = segments[1:]
# Recognizer is fresh or was reset
if recognizer.segments_start_time is None:
recognizer.segments_start_time = segments[0].start
logging.info(f"Starting from: {segments[0].start}")
@ -128,5 +99,14 @@ def main(channel, database="", base_dir=".",
finish_off_recognizer(recognizer, db_cursor)
db_conn.close()
exit(0)
elif datetime.now() - segments_end_time > timedelta(minutes=5):
# Last seen segment ended more than five minutes ago. We hit a gap that will likely stay unfilled.
# Reset and jump to the other end of the gap.
finish_off_recognizer(recognizer, db_cursor)
else:
# End of live segment or a gap that is not old and might get filled.
# Give it a bit of time and continue.
# Note: if the gap is not filled within 30s, we jump to the next available segment.
sleep(30)
start_time = segments_end_time

@ -7,10 +7,8 @@ setup(
install_requires = [
"argh",
"psycopg2",
#"gevent==1.5a2",
"gevent",
#"greenlet==0.4.16",
"greenlet",
"gevent==1.5a2",
"greenlet==0.4.16",
"psycogreen",
"wubloader-common",
"python-dateutil",

@ -49,9 +49,9 @@ CREATE TABLE buscribe_verifiers
);
-- For testing
-- INSERT INTO buscribe_verifiers(email, name)
-- VALUES ('placeholder@example.com', 'Place Holder'),
-- ('aguy@example.com', 'Arnold Guyana');
INSERT INTO buscribe_verifiers(email, name)
VALUES ('placeholder@example.com', 'Place Holder'),
('aguy@example.com', 'Arnold Guyana');
CREATE TABLE buscribe_line_speakers
(
@ -62,13 +62,6 @@ CREATE TABLE buscribe_line_speakers
PRIMARY KEY (line, speaker, verifier)
);
-- Associates transcription lines with speakers without recording a verifier.
-- Each (line, speaker) pair may appear at most once.
-- NOTE(review): presumably holds machine-inferred speaker attributions, as
-- opposed to the verifier-keyed rows in buscribe_line_speakers - confirm.
CREATE TABLE buscribe_line_inferred_speakers
(
-- The transcription line the speaker is attached to.
line BIGINT NOT NULL REFERENCES buscribe_transcriptions,
-- The speaker attached to that line.
speaker BIGINT NOT NULL REFERENCES buscribe_speakers,
PRIMARY KEY (line, speaker)
);
CREATE TABLE buscribe_verified_lines
(
-- id BIGSERIAL PRIMARY KEY,
@ -90,13 +83,12 @@ CREATE VIEW buscribe_all_transcriptions AS
SELECT buscribe_transcriptions.id,
start_time,
end_time,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names,
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
null AS transcription_json
verified_line AS transcription_line,
setweight(to_tsvector('english', verified_line), 'C') AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
null AS transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN buscribe_verified_lines ON buscribe_transcriptions.id = buscribe_verified_lines.line
LEFT OUTER JOIN (
@ -114,66 +106,15 @@ SELECT id,
start_time,
end_time,
null AS verifier,
names,
null AS names,
transcription_line,
to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (
SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line
) AS speakers ON id = speakers.line;
FROM buscribe_transcriptions;
ROLLBACK;
-- Combined view over verified and unverified transcription data, with
-- precomputed text-search vectors. It is the UNION of two branches:
--   1. lines that have a verified text and/or verifier-assigned speakers
--      (verifier is non-null, transcription_json is null);
--   2. every transcription line as originally stored (verifier is null),
--      with speaker names taken from buscribe_line_inferred_speakers.
CREATE VIEW buscribe_all_transcriptions2 AS
-- Branch 1: verified lines and/or verified speakers.
SELECT buscribe_transcriptions.id,
start_time,
end_time,
-- The verifier comes from the verified line if there is one, otherwise from the speaker rows.
coalesce(buscribe_verified_lines.verifier, speakers.verifier) AS verifier,
names,
-- Prefer the verified text; fall back to the stored transcription.
coalesce(verified_line, buscribe_transcriptions.transcription_line) AS transcription_line,
to_tsvector('english', buscribe_transcriptions.transcription_line) AS machine_line_ts,
setweight(to_tsvector('english', verified_line), 'C') AS verified_line_ts,
-- Same preference as transcription_line, expressed as a search vector.
coalesce(setweight(to_tsvector('english', verified_line), 'C'),
to_tsvector('english', buscribe_transcriptions.transcription_line)) AS transcription_line_ts,
setweight(to_tsvector(array_to_string(names, ' ')), 'C') AS names_ts,
null AS transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN buscribe_verified_lines ON buscribe_transcriptions.id = buscribe_verified_lines.line
LEFT OUTER JOIN (
-- Collapse the speakers of each (line, verifier) pair into one array of names.
SELECT line, verifier, array_agg(name) AS names
FROM buscribe_line_speakers
INNER JOIN buscribe_speakers ON buscribe_line_speakers.speaker = buscribe_speakers.id
GROUP BY line, verifier
) AS speakers ON buscribe_transcriptions.id = speakers.line AND (
-- Pair speakers with the verified line from the same verifier, or accept
-- any verifier's speakers when the line has no verified text.
speakers.verifier = buscribe_verified_lines.verifier OR
buscribe_verified_lines.verifier IS NULL
)
-- Keep only rows that carry some verification (verified text or verified speakers).
WHERE coalesce(buscribe_verified_lines.verifier, speakers.verifier) IS NOT NULL
UNION
-- Branch 2: all stored transcription lines, unverified.
SELECT id,
start_time,
end_time,
null AS verifier,
names,
transcription_line,
to_tsvector('english', transcription_line) AS machine_line_ts,
null AS verified_line_ts,
to_tsvector('english', transcription_line) AS transcription_line_ts,
null AS names_ts,
transcription_json
FROM buscribe_transcriptions
LEFT OUTER JOIN (
-- Collapse the inferred speakers of each line into one array of names.
SELECT line, array_agg(name) AS names
FROM buscribe_line_inferred_speakers
INNER JOIN buscribe_speakers ON buscribe_line_inferred_speakers.speaker = buscribe_speakers.id
GROUP BY line
) AS speakers ON id = speakers.line;
-- Convert last lexeme in a query to prefix query.
CREATE FUNCTION convert_query(query_text text) RETURNS tsquery AS
$$

@ -1,130 +0,0 @@
version: "3"
services:
buscribenginx:
image: buscribe-web:0.0.0
ports:
- "8020:80"
volumes:
- /srv/wubloader/segments:/usr/share/nginx/html/segments
networks:
- default
- wubloader_default
- traefik_network
labels:
- "traefik.docker.network=traefik_network"
- "traefik.http.routers.buscribe-router.rule=Host(`wubloader.raptorpond.com`)"
- "traefik.http.routers.buscribe-redirect.rule=Host(`wubloader.raptorpond.com`)"
- "traefik.http.routers.buscribe-redirect.entrypoints=web"
- "traefik.http.routers.buscribe-router.tls=true"
- "traefik.http.routers.buscribe-router.tls.certresolver=leresolver"
- "traefik.http.middlewares.buscribe-redirectscheme.redirectscheme.scheme=https"
- "traefik.http.middlewares.buscribe-redirectscheme.redirectscheme.permanent=true"
- "traefik.http.routers.buscribe-redirect.middlewares=buscribe-redirectscheme@docker"
restart: "on-failure"
# buscribelrr:
# image: buscribe:0.0.0
# command: [ "loadingreadyrun",
# "--start-time=2022-11-11T12:00:00Z",
# "--end-time=2022-11-20T22:00:00Z",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr",
# "--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
# volumes:
# - /srv/wubloader/segments:/mnt
# buscribedb:
# image: buscribe:0.0.0
# command: [ "desertbus",
# "--start-time=2023-11-10T12:00:00Z",
# "--end-time=2023-11-15T00:00:00Z",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
# "--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
# volumes:
# - /srv/wubloader/segments:/mnt
buscribedb0:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-19T00:00:00Z",
"--end-time=2023-11-19T06:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb1:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T06:00:00Z",
"--end-time=2023-11-18T12:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb2:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T12:00:00Z",
"--end-time=2023-11-18T18:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
buscribedb3:
image: buscribe:0.0.0
command: [ "desertbus",
"--start-time-override=2023-11-18T18:00:00Z",
"--end-time=2023-11-19T00:00:00Z",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--model=/usr/share/buscribe/vosk-model-en-us-0.22/" ]
volumes:
- /srv/wubloader/segments:/mnt
# buscribeapilrr:
# image: buscribe-api:0.0.0
# command: [
# "loadingreadyrun",
# "--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr",
# "--bustime-start=2023-11-11T22:00:00Z" ]
buscribeapidb:
image: buscribe-api:0.0.0
command: [
"desertbus",
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--bustime-start=2023-11-11T22:00:00Z" ]
volumes:
- /srv/wubloader/segments:/mnt
professorapidb:
image: professor-api:0.0.0
command: [
"--database=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_db",
"--bustime-start=2023-11-11T22:00:00Z" ]
postgres:
image: postgres:13
ports:
- "7654:5432"
environment:
- POSTGRES_USER=vst
- POSTGRES_DB=postgres
- POSTGRES_PASSWORD=flnMSYPRf
volumes:
- /srv/buscribe/postgres:/var/lib/postgresql/data
restart: "unless-stopped"
postgres-prometheus:
image: quay.io/prometheuscommunity/postgres-exporter
ports:
- "9187:9187"
environment:
- DATA_SOURCE_NAME=postgresql://vst:flnMSYPRf@postgres:5432/buscribe_lrr?sslmode=disable
networks:
wubloader_default:
external: true
traefik_network:
external: true

@ -1,5 +0,0 @@
# Tool image wrapping the LESS compiler: any arguments given to
# `docker run` are passed straight to `lessc`.
FROM node:17-alpine
# Install the LESS compiler globally so `lessc` is on the PATH.
RUN npm install less -g
ENTRYPOINT ["lessc"]

@ -1,5 +0,0 @@
# nginx image bundling the two static front ends and the server configuration.
# All COPY sources are relative to the build context.
FROM nginx:latest
# Static files of the Buscribe and Professor front ends.
COPY buscribe-web /usr/share/nginx/html/buscribe
COPY professor /usr/share/nginx/html/professor
# Replace the image's default nginx configuration.
COPY nginx/nginx.conf /etc/nginx/nginx.conf

@ -1,57 +0,0 @@
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log notice;
pid /var/run/nginx.pid;
events {
worker_connections 1024;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
access_log /var/log/nginx/access.log main;
sendfile on;
#tcp_nopush on;
keepalive_timeout 65;
gzip on;
gzip_comp_level 9;
absolute_redirect off;
server {
listen 80;
server_name localhost;
#access_log /var/log/nginx/host.access.log main;
location / { proxy_pass http://nginx; }
location /buscribelrr {
alias /usr/share/nginx/html/buscribe;
}
location /buscribe {
alias /usr/share/nginx/html/buscribe;
}
location /professor {
alias /usr/share/nginx/html/professor;
}
#location /buscribe/loadingreadyrun/json { proxy_pass http://buscribeapilrr:8010/buscribe/json; }
location /buscribe/desertbus/json { proxy_pass http://buscribeapidb:8010/buscribe/json; }
location /professor/desertbus { proxy_pass http://professorapidb:8011/professor; }
}
}

@ -13,10 +13,10 @@ from professor_api.professor_api import app
def cors(app):
"""WSGI middleware that sets CORS headers"""
HEADERS = [
("Access-Control-Allow-Credentials", "true"),
("Access-Control-Allow-Headers", "content-type"),
("Access-Control-Allow-Credentials", "false"),
("Access-Control-Allow-Headers", "*"),
("Access-Control-Allow-Methods", "GET,HEAD,POST,PUT"),
("Access-Control-Allow-Origin", "http://localhost:63342,https://wubloader.raptorpond.com"),
("Access-Control-Allow-Origin", "*"),
("Access-Control-Expose-Headers", "*"),
("Access-Control-Max-Age", "86400"),
]
@ -45,7 +45,7 @@ def servelet(server):
'postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE')
@argh.arg('--bustime-start',
help='The start time in UTC for the event, for UTC-Bustime conversion')
def main(database="", host='0.0.0.0', port=8011, bustime_start=None):
def main(database="", host='0.0.0.0', port=8005, bustime_start=None):
if bustime_start is None:
logging.error("Missing --bustime-start!")
exit(1)

@ -1,7 +1,5 @@
import re
import urllib.parse
from functools import wraps
from random import randrange
import flask
import gevent
@ -10,51 +8,9 @@ from flask import jsonify, request, copy_current_request_context
from gevent import sleep
from psycopg2.extras import execute_values
from google.oauth2 import id_token
from google.auth.transport import requests
app = flask.Flask('buscribe')
def authenticate(f):
    """Decorator that only runs the view for a known, Google-authenticated verifier.

    The wrapped view is called with an extra keyword argument ``editor`` set
    to the caller's lower-cased email address. Otherwise the wrapper returns:

    * ``401`` when the ``credentials`` cookie is missing or empty,
    * ``403`` when the token fails verification or names the wrong issuer,
    * ``403`` when the email is not listed in ``buscribe_verifiers``.

    Reference: https://developers.google.com/identity/sign-in/web/backend-auth
    https://developers.google.com/identity/gsi/web/guides/verify-google-id-token#using-a-google-api-client-library"""

    @wraps(f)
    def auth_wrapper(*args, **kwargs):
        # .get() returns None for a missing cookie instead of raising, so the
        # absence has to be tested explicitly. The token is a credential and
        # is deliberately never printed or logged.
        user_token = request.cookies.get("credentials")
        if not user_token:
            return 'User token required', 401

        try:
            idinfo = id_token.verify_oauth2_token(user_token, requests.Request(),
                                                  "164084252563-kaks3no7muqb82suvbubg7r0o87aip7n.apps.googleusercontent.com")
            if idinfo['iss'] not in ['accounts.google.com', 'https://accounts.google.com']:
                raise ValueError('Wrong issuer.')
        except ValueError:
            return 'Invalid token. Access denied.', 403

        # check whether user is in the database
        email = idinfo['email'].lower()
        conn = app.db_manager.get_conn()
        results = database.query(conn, """
            SELECT email
            FROM buscribe_verifiers
            WHERE lower(email) = %s""", email)
        row = results.fetchone()
        if row is None:
            return 'Unknown user. Access denied.', 403

        return f(*args, editor=email, **kwargs)

    return auth_wrapper
@app.route('/professor/line/<int:line_id>', methods=["GET"])
def get_line(line_id):
db_conn = app.db_manager.get_conn()
@ -64,27 +20,7 @@ def get_line(line_id):
if line is None:
return "Line not found.", 404
else:
return {"id": line.id,
"start_time": line.start_time.isoformat(),
"end_time": line.end_time.isoformat(),
"line_data": line.transcription_json}
@app.route('/professor/line/random', methods=["GET"])
def get_random_line():
db_conn = app.db_manager.get_conn()
n_lines = database.query(db_conn, "SELECT count(*) AS n_lines FROM buscribe_transcriptions;").fetchone().n_lines
row = randrange(n_lines)
line = database.query(db_conn, "SELECT * FROM buscribe_transcriptions OFFSET %(row)s LIMIT 1;", row=row).fetchone()
if line is None:
return "Line not found.", 404
else:
return {"id": line.id,
"start_time": line.start_time.isoformat(),
return {"start_time": line.start_time.isoformat(),
"end_time": line.end_time.isoformat(),
"line_data": line.transcription_json}
@ -106,13 +42,12 @@ def get_playlist(line_id):
#EXT-X-TARGETDURATION:{duration.total_seconds()}
#EXT-X-PROGRAM-DATE-TIME:{start_time_iso}
#EXTINF:{duration.total_seconds()}
/cut/desertbus/source.ts?start={urllib.parse.quote_plus(start_time_iso)}&end={urllib.parse.quote_plus(end_time_iso)}&type=rough&allow_holes=true
//localhost/cut/desertbus/source.ts?start={urllib.parse.quote_plus(start_time_iso)}&end={urllib.parse.quote_plus(end_time_iso)}&type=rough&allow_holes=true
#EXT-X-ENDLIST"""
@app.route('/professor/line/<int:line_id>', methods=["POST"])
@authenticate
def update_line(line_id, editor):
def update_line(line_id):
db_conn = app.db_manager.get_conn()
if "speakers" in request.json and \
@ -121,11 +56,11 @@ def update_line(line_id, editor):
# Simpler than dealing with uniqueness
database.query(db_conn,
"DELETE FROM buscribe_line_speakers WHERE line = %(line_id)s AND verifier = %(verifier)s;",
line_id=line_id, verifier=editor)
line_id=line_id, verifier="placeholder@example.com")
execute_values(db_conn.cursor(),
"INSERT INTO buscribe_line_speakers(line, speaker, verifier) "
"VALUES %s;",
[(line_id, speaker, editor) for speaker in
[(line_id, speaker, "placeholder@example.com") for speaker in
request.json["speakers"]])
if "transcription" in request.json and \
isinstance(request.json["transcription"], str) and \
@ -135,11 +70,11 @@ def update_line(line_id, editor):
database.query(db_conn,
"DELETE FROM buscribe_verified_lines WHERE line = %(line_id)s AND verifier = %(verifier)s;",
line_id=line_id, verifier=editor)
line_id=line_id, verifier="placeholder@example.com")
database.query(db_conn,
"INSERT INTO buscribe_verified_lines(line, verified_line, verifier) "
"VALUES (%(line)s, %(verified_line)s, %(verifier)s)",
line=line_id, verified_line=verified_line, verifier=editor)
line=line_id, verified_line=verified_line, verifier="placeholder@example.com")
return "", 204
@ -166,8 +101,7 @@ def get_speaker(speaker_id):
@app.route('/professor/speaker', methods=["PUT"])
@authenticate
def new_speaker(editor=None):
def new_speaker():
name = request.json
if not isinstance(name, str):

@ -11,7 +11,6 @@ setup(
"psycogreen",
"wubloader-common",
"python-dateutil",
"flask",
"google-auth"
"flask"
],
)

File diff suppressed because one or more lines are too long

@ -5,6 +5,7 @@
<title>Buscribe -- Professor</title>
<link href="video.js/dist/video-js.min.css" rel="stylesheet">
<!-- <link href="videojs-hls-quality-selector/dist/videojs-hls-quality-selector.css" rel="stylesheet">-->
<link href="jquery-ui-1.13.0.custom/jquery-ui.css" rel="stylesheet">
<link href="style.css" rel="stylesheet">
@ -12,10 +13,12 @@
<script src="jquery-ui-1.13.0.custom/external/jquery/jquery.js"></script>
<script src="jquery-ui-1.13.0.custom/jquery-ui.js"></script>
<script src="hotkeys.min.js"></script>
<script src="script.js"></script>
<!-- <script src="videojs-contrib-quality-levels/dist/videojs-contrib-quality-levels.min.js"></script>-->
<!-- <script src="videojs-hls-quality-selector/dist/videojs-hls-quality-selector.min.js"></script>-->
</head>
<body onload="pageReady()">
@ -43,13 +46,7 @@
<button id="submit_button" onclick="submit()" type="button">Submit</button><span id="update_indicator"></span>
<div id="googleLoginButton" style="display: none"></div>
<div id="logout" style="display: none"><a href="javascript:doLogout()">Log out</a></div>
<script src="video.js/dist/video.min.js"></script>
<script src="https://accounts.google.com/gsi/client" async defer></script>
<script>
window.onGoogleLibraryLoad = doGoogle
</script>
</body>
</html>

@ -1,13 +1,7 @@
function pageReady() {
const params = new URLSearchParams(document.location.search.substring(1));
let line_id;
if (params.get("line") !== "random") {
line_id = parseInt(params.get("line"), 10);
} else {
line_id = "random"
}
line_id = parseInt(params.get("line"), 10);
videojs("player", {
// src: "test.m3u8",
@ -33,61 +27,15 @@ function pageReady() {
const bgOpacitySelector = document.querySelector('.vjs-bg-opacity > select');
bgOpacitySelector.value = "0.5"
fetch(`/professor/desertbus/line/${line_id}`)
fetch(`//localhost:8005/professor/line/${line_id}`)
.then(response => response.json())
.then(fillLineInfo)
.then(initializePlayer);
handleLoginState();
}
// Keyboard shortcut: Ctrl+Enter clicks the element with id "submit_button".
hotkeys('ctrl+enter', function (event, handler){
document.getElementById("submit_button").click();
});
function handleLoginState() {
    // Reveal the log-out link when a "credentials" cookie is present,
    // otherwise reveal the Google sign-in button.
    const hasCredentials = document.cookie
        .split('; ')
        .some(entry => entry.startsWith('credentials='));
    const elementToShow = hasCredentials ? "logout" : "googleLoginButton";
    document.getElementById(elementToShow).style.display = "";
}
function doGoogle() {
    // Set up Google Identity Services: register the client, draw the
    // sign-in button, then offer the One Tap prompt.
    const identity = google.accounts.id;
    const signInOptions = {
        client_id: "164084252563-kaks3no7muqb82suvbubg7r0o87aip7n.apps.googleusercontent.com",
        callback: loggedIn,
        auto_select: true
    };
    const buttonAppearance = {theme: "outline", size: "large"}; // customization attributes

    identity.initialize(signInOptions);
    identity.renderButton(document.getElementById("googleLoginButton"), buttonAppearance);
    identity.prompt(); // also display the One Tap dialog
}
function doLogout() {
    // An expiry date in the past makes the browser discard the cookie.
    document.cookie = `credentials=;expires=Thu, 01 Jan 1970 00:00:01 GMT`;

    // Show the sign-in button again and hide the log-out link.
    const displayById = [["googleLoginButton", ""], ["logout", "none"]];
    for (const [elementId, displayValue] of displayById) {
        document.getElementById(elementId).style.display = displayValue;
    }
}
/**
 * Sign-in callback registered with Google Identity Services in doGoogle().
 *
 * Stores the returned ID token in the "credentials" cookie and swaps the
 * sign-in button for the log-out link.
 *
 * @param response object passed by the sign-in library; its `credential`
 *                 field is stored as the cookie value.
 */
function loggedIn(response) {
    document.cookie = `credentials=${response.credential}`;
    document.getElementById("googleLoginButton").style.display = "none";
    document.getElementById("logout").style.display = "";
    // The response is intentionally not logged: it contains the bearer token.
}
function fillLineInfo(line_json) {
line_id = line_json.id
// document.getElementById("original_transcription").innerText = line_json.line_data.text;
line = line_json
document.getElementById("original_transcription").innerHTML = line_json.line_data.result
.map(word => `<span style="opacity: ${word.conf}">${word.word}</span>`).join(" ");
@ -97,12 +45,11 @@ function fillLineInfo(line_json) {
function initializePlayer() {
videojs.getPlayer("player").src([
//{src: `/professor/desertbus/line/${line_id}/playlist.m3u8`}
{src: `/playlist/desertbus/source.m3u8?start=${line.start_time}&end=${line.end_time}`}
{src: `//localhost:8005/professor/line/${line_id}/playlist.m3u8`}
]);
videojs.getPlayer("player").addRemoteTextTrack({
kind: "captions",
src: `/buscribe/desertbus/vtt?start_time=${line.start_time}&end_time=${line.end_time}`,
src: `//localhost:8010/buscribe/vtt?start_time=${line.start_time}&end_time=${line.end_time}`,
srclang: "en",
label: "English",
default: true
@ -126,28 +73,26 @@ async function submit() {
}
}
return await fetch("/professor/desertbus/speaker",
return await fetch("//localhost:8005/professor/speaker",
{
method: "PUT",
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(speaker),
credentials: "include"
body: JSON.stringify(speaker)
}).then(response =>
parseInt(response.headers.get("Content-Location")
.split("/")
.pop(), 10));
}));
fetch(`/professor/desertbus/line/${line_id}`,
fetch(`//localhost:8005/professor/line/${line_id}`,
{
method: "POST",
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({transcription: new_transcription, speakers: new_speakers}),
credentials: "include"
body: JSON.stringify({transcription: new_transcription, speakers: new_speakers})
}).then(response => {
if (response.ok) {
document.getElementById("update_indicator").innerText = "\u2714\ufe0f"
@ -158,7 +103,7 @@ async function submit() {
}
$(function () {
fetch("/professor/desertbus/speaker")
fetch("//localhost:8005/professor/speaker")
.then(response => response.json())
.then(function (speakers_json) {
speakers = speakers_json;
@ -208,16 +153,3 @@ $(function () {
)
});
/**
 * Decode the payload (second dot-separated segment) of a JWT and return it
 * as a parsed JSON value. The signature is NOT verified here.
 */
function parseJwt(token) {
    const payloadSegment = token.split('.')[1];
    // base64url -> standard base64 alphabet
    const standardBase64 = payloadSegment.replace(/-/g, '+').replace(/_/g, '/');
    // atob yields one character per byte; percent-encode every byte so that
    // decodeURIComponent can reassemble multi-byte UTF-8 sequences.
    const percentEncoded = Array.from(
        atob(standardBase64),
        ch => '%' + ch.charCodeAt(0).toString(16).padStart(2, '0')
    ).join('');
    return JSON.parse(decodeURIComponent(percentEncoded));
}

@ -61,10 +61,3 @@ button {
span.verified_cc {
color: #c1ffc1;
}
/* Log-out container; the nested rule styles the link inside it. */
#logout {
padding: 0.1em;
a {
color: darkgray
}
}

@ -1,12 +0,0 @@
#!/bin/bash
# Runs the buscribe:0.0.0 image once (--rm) with /srv/wubloader/segments/
# mounted at /mnt/, passing "loadingreadyrun" (presumably the channel name --
# confirm) plus a start/end time window, a database URL and a model path.
# NOTE(review): the --database URL below embeds a username and password in
# plain text; consider supplying it from the environment or a secrets file.
docker run \
--rm \
-v /srv/wubloader/segments/:/mnt/ \
buscribe:0.0.0 \
loadingreadyrun \
--start-time='2021-11-05T00:00' \
--end-time='2021-11-07T00:00' \
--database=postgresql://vst:flnMSYPRf@mula.lan:6543/buscribe_lrr \
--model=/usr/share/buscribe/vosk-model-en-us-0.22/
# Alternative model path, kept for reference (not passed to the command above):
# --model=/usr/share/buscribe/vosk-model-small-en-us-0.15/
Loading…
Cancel
Save