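"""Search backend for the escher Flask app.

Runs a full-text query against the buscribe transcription tables, the events
(video/VST) table, and the chat archive; clusters hits into time-windowed Results;
and merges overlapping Results from the three sources, ranked by weight.
"""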
from collections import namedtuple
import flask
from datetime import datetime
from common import database
from gevent.pool import Pool
from psycopg2.extras import execute_values
app = flask.Flask('escher')


class Result:
    def __init__(self, transcript=None, vst=None, chat=None):
        self.transcript = transcript if transcript else []
        self.vst = vst
        self.chat = chat if chat else []

    def __repr__(self) -> str:
        return f'Result(transcript={self.transcript}, vst={self.vst}, chat={self.chat})'

    @property
    def start_time(self):
        """Compute the start time of the whole result."""
        start_times = [self.vst.start_time] if self.vst else []
        if self.transcript:
            start_times.append(self.transcript[0].start_time)
        if self.chat:
            start_times.append(self.chat[0].pub_time)
        return min(start_times)

    @property
    def end_time(self):
        """Compute the end time of the whole result."""
        end_times = [self.vst.end_time] if self.vst else []
        if self.transcript:
            end_times.append(self.transcript[-1].end_time)
        if self.chat:
            end_times.append(self.chat[-1].pub_time)
        return max(end_times)

    @property
    def weight(self):
        """Relative importance: chat ranks count once, transcript ranks 10x, a VST match 20x."""
        return sum([cl.rank for cl in self.chat], 0) + \
            10 * sum([tl.rank for tl in self.transcript], 0) + \
            20 * (self.vst.rank if self.vst else 0)


def get_transcript(db_conn, ts_query, start_time="-infinity", end_time="infinity"):
    query = """
    --sql
    WITH q AS (
        SELECT convert_query(%(ts_query)s)
    ),
    relevant_lines AS (
        (
            SELECT id
            FROM buscribe_transcriptions
            WHERE to_tsvector('english', transcription_line) @@ (SELECT * FROM q)
        )
        UNION
        (
            SELECT line
            FROM buscribe_verified_lines
            WHERE to_tsvector('english', verified_line) @@ (SELECT * FROM q)
        )
        UNION
        (
            SELECT line
            FROM buscribe_line_speakers
            WHERE to_tsvector('english', speaker_name) @@ (SELECT * FROM q)
        )
        UNION
        (
            SELECT line
            FROM buscribe_line_inferred_speakers
            WHERE to_tsvector('english', speaker_name) @@ (SELECT * FROM q)
        )
    )
    (
        (
            SELECT id,
                start_time,
                end_time,
                null AS verifier,
                names,
                transcription_line,
                ts_rank_cd(
                    coalesce(
                        to_tsvector('english', transcription_line),
                        ''::tsvector
                    ) || coalesce(
                        to_tsvector(array_to_string(names, ' ')),
                        ''::tsvector
                    ),
                    (SELECT * FROM q)
                ) AS rank,
                ts_headline(
                    transcription_line,
                    (SELECT * FROM q),
                    'StartSel=''<span class=\"highlight\">'', StopSel=</span>'
                ) AS highlighted_text
                -- transcription_json
            FROM buscribe_transcriptions
                LEFT OUTER JOIN (
                    SELECT line,
                        ARRAY(
                            SELECT speaker_name
                            FROM buscribe_line_inferred_speakers AS inner_speakers
                            WHERE inner_speakers.line = buscribe_line_inferred_speakers.line
                        ) AS names
                    FROM buscribe_line_inferred_speakers
                ) AS inferred_speakers ON id = inferred_speakers.line
            WHERE id IN (
                    SELECT id
                    FROM relevant_lines
                )
                AND start_time >= coalesce(%(start_time)s, '-infinity'::timestamp)
                AND end_time <= coalesce(%(end_time)s, 'infinity'::timestamp)
        )
        UNION
        (
            SELECT buscribe_transcriptions.id AS id,
                start_time,
                end_time,
                cverifier AS verifier,
                names,
                coalesce(
                    verifications.verified_line,
                    buscribe_transcriptions.transcription_line
                ) AS transcription_line,
                ts_rank_cd(
                    coalesce(
                        setweight(to_tsvector('english', verified_line), 'C'),
                        to_tsvector(
                            'english',
                            buscribe_transcriptions.transcription_line
                        ),
                        ''::tsvector
                    ) || coalesce(
                        setweight(to_tsvector(array_to_string(names, ' ')), 'C'),
                        ''::tsvector
                    ),
                    (SELECT * FROM q)
                ) AS rank,
                ts_headline(
                    coalesce(
                        verifications.verified_line,
                        buscribe_transcriptions.transcription_line
                    ),
                    (SELECT * FROM q),
                    'StartSel=''<span class=\"highlight\">'', StopSel=</span>'
                ) AS highlighted_text
                -- null AS transcription_json
            FROM buscribe_transcriptions
                INNER JOIN (
                    SELECT *,
                        coalesce(relevant_verified.line, relevant_speakers.line) AS cline,
                        coalesce(
                            relevant_verified.verifier,
                            relevant_speakers.verifier
                        ) AS cverifier
                    FROM (
                            SELECT *
                            FROM buscribe_verified_lines
                            WHERE line IN (
                                    SELECT id
                                    FROM relevant_lines
                                )
                        ) AS relevant_verified
                        FULL OUTER JOIN (
                            SELECT line,
                                verifier,
                                ARRAY(
                                    SELECT speaker_name
                                    FROM buscribe_line_speakers AS inner_speakers
                                    WHERE inner_speakers.line = buscribe_line_speakers.line
                                        AND inner_speakers.verifier = buscribe_line_speakers.verifier
                                ) AS names
                            FROM buscribe_line_speakers
                            WHERE line IN (
                                    SELECT id
                                    FROM relevant_lines
                                )
                        ) AS relevant_speakers ON relevant_verified.line = relevant_speakers.line
                        AND relevant_speakers.verifier = relevant_verified.verifier
                ) AS verifications ON id = verifications.cline
            WHERE start_time >= coalesce(%(start_time)s, '-infinity'::timestamp)
                AND end_time <= coalesce(%(end_time)s, 'infinity'::timestamp)
        )
    )
    ORDER BY
        --rank DESC,
        start_time;
    """
    db_results = database.query(db_conn, query,
                                start_time=start_time if start_time is not None else '-infinity',
                                end_time=end_time if end_time is not None else 'infinity',
                                ts_query=ts_query)

    # Number of messages
    n_m = db_results.rowcount

    # Get duration for message frequency calculation
    bus_duration = database.query(db_conn,
                                  """
                                  --sql
                                  SELECT EXTRACT(EPOCH FROM max(end_time) - min(start_time)) AS bus_duration FROM buscribe_transcriptions;
                                  """
                                  ).fetchone().bus_duration

    # Priors saying that an interesting event (p < 0.01) is when three messages per minute are posted, giving the
    # equivalent average 13 messages per minute
    a_p = 13.0
    b_p = 60.0

    # 0.02th quantile difference (cf. chat)
    p = 0.02

    l = b_p + bus_duration
    a = a_p + n_m

    # Lomax distribution is posterior predictive for exponential
    message_duration_diff = l * ((1 - p)**-(1/a) - 1)
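    # Derivation sketch (added for clarity; assumes the Gamma-exponential model the
    # priors above imply): gaps between messages ~ Exponential(rate), rate ~ Gamma(a_p, b_p);
    # after n_m messages over bus_duration seconds the posterior is Gamma(a, l), and the
    # posterior predictive for the next gap is Lomax(shape=a, scale=l) with
    # CDF F(x) = 1 - (1 + x/l)**-a. Solving F(x) = p gives the p-quantile used as the
    # clustering window: x = l * ((1 - p)**(-1/a) - 1).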

    current_result = Result()
    results = []

    # print(message_duration_diff)

    for transcript_line in db_results:
        # Current result set is new
        if not current_result.transcript:
            current_result.transcript.append(transcript_line)
        # New message is within window
        elif (transcript_line.start_time - current_result.transcript[-1].end_time).total_seconds() <= message_duration_diff:
            # print((transcript_line.start_time -
            #        current_result.transcript[-1].end_time).total_seconds())
            current_result.transcript.append(transcript_line)
        # New message is outside window
        else:
            # Always save the run (cf. chat; we save all lines)
            results.append(current_result)

            # Start new run
            current_result = Result(transcript=[transcript_line])

    # Save the final run; skip it if the query returned no rows at all
    if current_result.transcript:
        results.append(current_result)

    return results


def get_vst(db_conn, ts_query, start_time="-infinity", end_time="infinity"):
    query = """
    --sql
    SELECT
        video_ranges[1].start AS start_time,
        video_ranges[1].end AS end_time,
        video_title AS title,
        video_id AS id,
        ts_rank_cd(
            setweight(
                to_tsvector('english'::regconfig, video_title),
                'C'),
            websearch_to_tsquery(%(ts_query)s)
        ) AS rank
    FROM events
    WHERE
        video_ranges[1].start >= %(start_time)s AND
        video_ranges[1].end <= %(end_time)s AND
        array_length(video_ranges, 1) = 1 AND -- Only handling single-cut videos for now
        to_tsvector('english'::regconfig, video_title) @@ websearch_to_tsquery(%(ts_query)s)
    ORDER BY
        --rank DESC,
        start_time
    LIMIT 100; -- Hard limit on result number
    """
    results = database.query(db_conn, query,
                             start_time=start_time if start_time is not None else '-infinity',
                             end_time=end_time if end_time is not None else 'infinity',
                             ts_query=ts_query)

    return [Result(vst=result) for result in results]


def get_chat(db_conn, ts_query, start_time="-infinity", end_time="infinity"):
    query = """
    --sql
    SELECT * FROM (
        SELECT
            pub_time,
            content->'tags'->>'display-name' AS name,
            content->'params'->>1 AS content,
            ts_rank_cd(
                setweight(
                    to_tsvector('english'::regconfig, content->'params'->>1),
                    'A'),
                websearch_to_tsquery(%(ts_query)s)
            ) AS rank
        FROM chat
        WHERE
            pub_time >= %(start_time)s AND
            pub_time <= %(end_time)s AND
            to_tsvector('english'::regconfig, content->'params'->>1) @@ websearch_to_tsquery(%(ts_query)s)
        --ORDER BY
        --    rank DESC
        --LIMIT 100 -- Hard limit on result number
    ) AS chat ORDER BY pub_time; -- Re-sort by time for merging
    """
    db_results = database.query(db_conn, query,
                                start_time=start_time if start_time is not None else '-infinity',
                                end_time=end_time if end_time is not None else 'infinity',
                                ts_query=ts_query)

    # Number of messages
    n_m = db_results.rowcount

    # Get duration for message frequency calculation
    bus_duration = database.query(db_conn,
                                  """
                                  --sql
                                  SELECT EXTRACT(EPOCH FROM max(pub_time) - min(pub_time)) AS bus_duration FROM chat;
                                  """
                                  ).fetchone().bus_duration

    # Priors saying that an interesting event (p < 0.01) is when three messages per minute are posted, giving the
    # equivalent average 13 messages per minute
    a_p = 13.0
    b_p = 60.0

    # 0.01th quantile difference
    p = 0.01

    l = b_p + bus_duration
    a = a_p + n_m

    # Lomax distribution is posterior predictive for exponential
    message_duration_diff = l * ((1 - p)**-(1/a) - 1)
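    # Same Gamma/exponential clustering threshold as in get_transcript; see the
    # derivation sketch there.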

    current_result = Result()
    results = []

    for chat_line in db_results:
        # Current result set is new
        if not current_result.chat:
            current_result.chat.append(chat_line)
        # New message is within window
        elif (chat_line.pub_time - current_result.chat[-1].pub_time).total_seconds() <= message_duration_diff:
            current_result.chat.append(chat_line)
        # New message is outside window
        else:
            # Current run has more than one message: save it
            if len(current_result.chat) > 1:
                results.append(current_result)

            # Start new run
            current_result = Result(chat=[chat_line])

    if len(current_result.chat) > 1:
        results.append(current_result)

    return results


def load_results_data(db_conn, results):
    """
    Replace chat and transcript with all entries in result's timeframe.
    """
    # ggroup = Pool(size=30)
    # results = ggroup.map(lambda result: load_result_data(db_manager, result), results)

    result_timespans = [(i, result.start_time, result.end_time) for (i, result) in enumerate(results)]

    # Clear lists so we can later insert new lines
    for result in results:
        result.chat = []
        result.transcript = []

    cur = db_conn.cursor()

    # execute_values expands %s into the (index, start_time, end_time) tuples from
    # result_timespans, so every returned row carries the index of the Result it falls into.
    execute_values(cur,
                   """
                   --sql
                   WITH timespans (id, start_time, end_time) AS (VALUES %s)
                   SELECT
                       timespans.id,
                       pub_time,
                       content->'tags'->>'display-name' AS name,
                       content->'params'->>1 AS content
                   FROM timespans JOIN chat ON (pub_time BETWEEN start_time AND end_time);
                   """,
                   result_timespans
                   )

    for chat_line in cur:
        results[chat_line.id].chat.append(chat_line)

    execute_values(cur,
                   """
                   --sql
                   WITH timespans (id, start_time, end_time) AS (VALUES %s)
                   SELECT
                       timespans.id,
                       buscribe_transcriptions.start_time,
                       buscribe_transcriptions.end_time,
                       names,
                       buscribe_transcriptions.transcription_line
                   FROM timespans JOIN
                       buscribe_transcriptions ON (
                           buscribe_transcriptions.start_time >= timespans.start_time AND buscribe_transcriptions.start_time <= timespans.end_time AND
                           buscribe_transcriptions.end_time >= timespans.start_time AND buscribe_transcriptions.end_time <= timespans.end_time
                       )
                       LEFT OUTER JOIN (
                           SELECT line,
                               ARRAY(
                                   SELECT speaker_name
                                   FROM buscribe_line_inferred_speakers AS inner_speakers
                                   WHERE inner_speakers.line = buscribe_line_inferred_speakers.line
                               ) AS names
                           FROM buscribe_line_inferred_speakers
                       ) AS inferred_speakers ON buscribe_transcriptions.id = inferred_speakers.line;
                   """,
                   result_timespans
                   )

    for transcript_line in cur:
        results[transcript_line.id].transcript.append(transcript_line)

    return results


def merge_results(transcript: list[Result], vst: list[Result], chat: list[Result], limit: int, offset=0):
"""
Merge different types of results in order of importance.
First, merge anything that overlaps with a VST clip into the clip's result.
Second, merge any chat the overlaps with transcripts into the transcript.
Finally, append remaining chats.
"""
transcript_i = 0
chat_i = 0
# Merge transcript and chat into vst
for vst_result in vst:
while transcript_i < len(transcript) and transcript[transcript_i].start_time < vst_result.end_time:
if overlap(vst_result, transcript[transcript_i]):
vst_result.transcript.extend(
transcript.pop(transcript_i).transcript)
else:
transcript_i += 1
while chat_i < len(chat) and chat[chat_i].start_time < vst_result.end_time:
if overlap(vst_result, chat[chat_i]):
vst_result.chat.extend(chat.pop(chat_i).chat)
else:
chat_i += 1
chat_i = 0
# Merge chat into transcript
for transcript_result in transcript:
while chat_i < len(chat) and chat[chat_i].start_time < transcript_result.end_time:
if overlap(transcript_result, chat[chat_i]):
transcript_result.chat.extend(chat.pop(chat_i).chat)
else:
chat_i += 1
merged = transcript + vst + chat
merged.sort(key=lambda result: result.start_time)
merged.sort(key=lambda result: result.weight, reverse=True)
return merged[offset:min((offset + limit), len(merged))]


def overlap(result_a, result_b):
    """
    Does result_b overlap result_a? True for:

    A |---------|
    B   |---|
    or
    A |-----|
    B     |---|
    or
    A   |---|
    B |---|

    (The case where B strictly contains A is not counted.)
    """
    return (result_b.start_time >= result_a.start_time and result_b.start_time <= result_a.end_time) or \
        (result_b.end_time >= result_a.start_time and result_b.end_time <= result_a.end_time)