mirror of https://github.com/ekimekim/wubloader
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
3.7 KiB
Python
104 lines
3.7 KiB
Python
import json
|
|
import logging
|
|
import subprocess
|
|
from datetime import timedelta, datetime
|
|
|
|
from gevent.event import Event
|
|
from psycopg2._psycopg import cursor
|
|
|
|
from buscribe.recognizer import BuscribeRecognizer
|
|
|
|
|
|
class HitMissingSegment(Exception):
    """Error type for running into a missing segment.

    NOTE(review): nothing in this part of the file raises or catches it;
    confirm the intended usage against the callers.
    """
|
|
|
|
|
|
def transcribe_segments(segments: list, sample_rate: int, recognizer: BuscribeRecognizer, start_of_transcript: datetime,
                        db_cursor: cursor, stopping: Event):
    """Transcribe a list of segments by streaming their audio through the recognizer.

    Each segment is decoded by ffmpeg to raw mono 16-bit PCM (s16le) at sample_rate
    and fed to the recognizer in chunks. Whenever the recognizer reports a finished,
    non-empty line, that line is written to the database, but only if it starts
    after start_of_transcript.

    The recognizer must be initialized to sample_rate and have start time set.

    Processing ends early at the first None entry in segments, or once the current
    segment has been fully processed and stopping is set.

    Returns segments[0].start plus the durations of every segment that was processed,
    i.e. the end time of the audio handed to the recognizer so far.

    Raises IndexError if segments is empty (and AttributeError if its first entry
    is None), since the start time is read from segments[0].
    """

    segments_end_time = segments[0].start

    for segment in segments:

        if segment is None:
            return segments_end_time

        segments_end_time += segment.duration

        ffmpeg_command = ['ffmpeg',
                          '-loglevel', 'quiet',
                          '-i', segment.path,
                          '-ar', str(sample_rate),
                          '-ac', '1',  # TODO: Check for advanced downmixing
                          '-f', 's16le', '-']

        # Use Popen as a context manager so that the stdout pipe is closed and the
        # ffmpeg process is waited on for every segment, including when an exception
        # propagates out of the loop. Without this each segment leaves behind an
        # open pipe and an un-reaped child process.
        with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE) as process:
            while True:
                data = process.stdout.read(16000)
                if not data:
                    # EOF: ffmpeg has finished writing this segment.
                    break

                # accept_waveform() is truthy only once a full line is available.
                if recognizer.accept_waveform(data):
                    result_json = json.loads(recognizer.result())
                    logging.debug(json.dumps(result_json, indent=2))

                    if result_json["text"] == "":
                        continue

                    # Word timings are offsets in seconds from the recognizer's start time.
                    line_start_time = recognizer.segments_start_time + timedelta(
                        seconds=result_json["result"][0]["start"])
                    line_end_time = recognizer.segments_start_time + timedelta(
                        seconds=result_json["result"][-1]["end"])

                    if line_start_time > start_of_transcript:
                        write_line(result_json, line_start_time, line_end_time, db_cursor)

        # Checked between segments, so the returned end time always corresponds to
        # audio that has been fully fed to the recognizer.
        if stopping.is_set():
            return segments_end_time

    return segments_end_time
|
|
|
|
|
|
def write_line(line_json: dict, line_start_time: datetime, line_end_time: datetime, db_cursor):
    """Insert one transcribed line into the buscribe_transcriptions table.

    Stores the start and end time, the line's "text", its "spk" value (NULL when
    the key is absent) and the whole line serialized as JSON. Only executes the
    INSERT on db_cursor; it does not call commit itself.
    """
    insert_statement = (
        "INSERT INTO buscribe_transcriptions("
        "start_time, end_time, transcription_line, line_speaker, transcription_json"
        ") VALUES (%s, %s ,%s, %s, %s)"
    )
    row_values = (
        line_start_time,
        line_end_time,
        line_json["text"],
        line_json.get("spk"),
        json.dumps(line_json),
    )
    db_cursor.execute(insert_statement, row_values)
|
|
|
|
|
|
def get_end_of_transcript(db_cursor):
    """Return the latest end_time stored in buscribe_transcriptions.

    Returns None when the table has no rows (cold start). The row is read via
    attribute access, so db_cursor must yield rows exposing an end_time attribute.
    """
    db_cursor.execute("SELECT end_time FROM buscribe_transcriptions ORDER BY end_time DESC LIMIT 1")
    latest_row = db_cursor.fetchone()

    if latest_row is None:
        return None
    return latest_row.end_time
|
|
|
|
|
|
def finish_off_recognizer(recognizer: BuscribeRecognizer, db_cursor):
    """Drain the recognizer, store whatever line it was still holding, then reset it.

    The final result is written to the database only when it carries a "result"
    entry; the recognizer is reset in either case.
    """
    flushed = json.loads(recognizer.final_result())  # Flush the tubes

    if "result" in flushed:
        words = flushed["result"]

        # Word timings are offsets in seconds from the recognizer's start time.
        first_word_start = recognizer.segments_start_time + timedelta(seconds=words[0]["start"])
        last_word_end = recognizer.segments_start_time + timedelta(seconds=words[-1]["end"])

        write_line(flushed, first_word_start, last_word_end, db_cursor)

    recognizer.reset()
|