downloader: Add framework to allow alternate "providers" besides twitch

This abstracts out the process of obtaining media playlists so that we can support non-twitch
streaming services.
pull/355/head
Mike Lang 1 year ago committed by Mike Lang
parent 30c1877b36
commit bc08d97e56

@ -300,7 +300,7 @@
"--base-dir", "/mnt", "--base-dir", "/mnt",
"--qualities", std.join(",", $.qualities), "--qualities", std.join(",", $.qualities),
"--backdoor-port", std.toString($.backdoor_port), "--backdoor-port", std.toString($.backdoor_port),
] + if $.downloader_creds_file != null then ["--auth-file", "/token"] else [], ] + if $.downloader_creds_file != null then ["--twitch-auth-file", "/token"] else [],
// Mount the segments directory at /mnt // Mount the segments directory at /mnt
volumes: ["%s:/mnt" % $.segments_path] volumes: ["%s:/mnt" % $.segments_path]
+ if $.downloader_creds_file != null then ["%s:/token" % $.downloader_creds_file] else [], + if $.downloader_creds_file != null then ["%s:/token" % $.downloader_creds_file] else [],

@ -21,7 +21,7 @@ import common
import common.dateutil import common.dateutil
import common.requests import common.requests
from . import twitch from .twitch import URLProvider, TwitchProvider, YoutubeProvider
segments_downloaded = prom.Counter( segments_downloaded = prom.Counter(
@ -121,9 +121,9 @@ class StreamsManager(object):
FETCH_MIN_INTERVAL = 20 FETCH_MIN_INTERVAL = 20
FETCH_TIMEOUTS = 5, 30 FETCH_TIMEOUTS = 5, 30
MAX_WORKER_AGE = 20*60*60 # 20 hours, twitch's media playlist links expire after 24 hours
def __init__(self, channel, base_dir, qualities, important=False, auth_token=None): def __init__(self, provider, channel, base_dir, qualities, important=False):
self.provider = provider
self.channel = channel self.channel = channel
self.logger = logging.getLogger("StreamsManager({})".format(channel)) self.logger = logging.getLogger("StreamsManager({})".format(channel))
self.base_dir = base_dir self.base_dir = base_dir
@ -133,7 +133,6 @@ class StreamsManager(object):
self.refresh_needed = gevent.event.Event() # set to tell main loop to refresh now self.refresh_needed = gevent.event.Event() # set to tell main loop to refresh now
self.stopping = gevent.event.Event() # set to tell main loop to stop self.stopping = gevent.event.Event() # set to tell main loop to stop
self.important = important self.important = important
self.auth_token = auth_token
self.master_playlist_log_level = logging.INFO if important else logging.DEBUG self.master_playlist_log_level = logging.INFO if important else logging.DEBUG
if self.important: if self.important:
self.FETCH_MIN_INTERVAL = self.IMPORTANT_FETCH_MIN_INTERVAL self.FETCH_MIN_INTERVAL = self.IMPORTANT_FETCH_MIN_INTERVAL
@ -215,8 +214,7 @@ class StreamsManager(object):
self.logger.log(self.master_playlist_log_level, "Fetching master playlist") self.logger.log(self.master_playlist_log_level, "Fetching master playlist")
fetch_time = monotonic() fetch_time = monotonic()
with soft_hard_timeout(self.logger, "fetching master playlist", self.FETCH_TIMEOUTS, self.trigger_refresh): with soft_hard_timeout(self.logger, "fetching master playlist", self.FETCH_TIMEOUTS, self.trigger_refresh):
master_playlist = twitch.get_master_playlist(self.channel, auth_token=self.auth_token) new_urls = self.provider.get_media_playlist_uris(list(self.stream_workers.keys()))
new_urls = twitch.get_media_playlist_uris(master_playlist, list(self.stream_workers.keys()))
self.update_urls(fetch_time, new_urls) self.update_urls(fetch_time, new_urls)
for quality, workers in self.stream_workers.items(): for quality, workers in self.stream_workers.items():
# warn and retry if the url is missing # warn and retry if the url is missing
@ -231,7 +229,7 @@ class StreamsManager(object):
continue continue
latest_worker = workers[-1] latest_worker = workers[-1]
# is the old worker too old? # is the old worker too old?
if latest_worker.age() > self.MAX_WORKER_AGE: if latest_worker.age() > self.provider.MAX_WORKER_AGE:
self.logger.info("Starting new worker for {} as the latest is too old ({}h)".format(quality, latest_worker.age() / 3600.)) self.logger.info("Starting new worker for {} as the latest is too old ({}h)".format(quality, latest_worker.age() / 3600.))
self.start_worker(quality) self.start_worker(quality)
except Exception as e: except Exception as e:
@ -250,7 +248,7 @@ class StreamsManager(object):
while not self.stopping.is_set(): while not self.stopping.is_set():
# clamp time to max age to non-negative, and default to 0 if no workers exist # clamp time to max age to non-negative, and default to 0 if no workers exist
time_to_next_max_age = max(0, min([ time_to_next_max_age = max(0, min([
self.MAX_WORKER_AGE - workers[-1].age() self.provider.MAX_WORKER_AGE - workers[-1].age()
for workers in self.stream_workers.values() if workers for workers in self.stream_workers.values() if workers
] or [0])) ] or [0]))
self.logger.log(self.master_playlist_log_level, "Next master playlist refresh in at most {} sec".format(time_to_next_max_age)) self.logger.log(self.master_playlist_log_level, "Next master playlist refresh in at most {} sec".format(time_to_next_max_age))
@ -349,7 +347,7 @@ class StreamWorker(object):
self.logger.debug("Getting media playlist {}".format(self.url)) self.logger.debug("Getting media playlist {}".format(self.url))
try: try:
with soft_hard_timeout(self.logger, "getting media playlist", self.FETCH_TIMEOUTS, self.trigger_new_worker): with soft_hard_timeout(self.logger, "getting media playlist", self.FETCH_TIMEOUTS, self.trigger_new_worker):
playlist = twitch.get_media_playlist(self.url, session=self.session) playlist = self.manager.provider.get_media_playlist(self.url, session=self.session)
except Exception as e: except Exception as e:
self.logger.warning("Failed to fetch media playlist {}".format(self.url), exc_info=True) self.logger.warning("Failed to fetch media playlist {}".format(self.url), exc_info=True)
self.trigger_new_worker() self.trigger_new_worker()
@ -597,22 +595,38 @@ class SegmentGetter(object):
stat.set(max(stat._value.get(), timestamp)) # NOTE: not thread-safe but is gevent-safe stat.set(max(stat._value.get(), timestamp)) # NOTE: not thread-safe but is gevent-safe
@argh.arg('channels', nargs="+", help= def parse_channel(channel):
if ":" in channel:
channel, type, url = channel.split(":", 2)
else:
type = "twitch"
url = None
important = channel.endswith("!")
channel = channel.rstrip("!")
return channel, important, type, url
@argh.arg('channels', nargs="+", type=parse_channel, help=
"Twitch channels to watch. Add a '!' suffix to indicate they're expected to be always up. " "Twitch channels to watch. Add a '!' suffix to indicate they're expected to be always up. "
"This affects retry interval, error reporting and monitoring." "This affects retry interval, error reporting and monitoring. "
"Non-twitch URLs can also be given with the form CHANNEL[!]:TYPE:URL"
) )
def main(channels, base_dir=".", qualities="source", metrics_port=8001, backdoor_port=0, auth_file=None): def main(channels, base_dir=".", qualities="source", metrics_port=8001, backdoor_port=0, twitch_auth_file=None):
qualities = qualities.split(",") if qualities else [] qualities = qualities.split(",") if qualities else []
auth_token = None twitch_auth_token = None
if auth_file is not None: if twitch_auth_file is not None:
with open(auth_file) as f: with open(twitch_auth_file) as f:
auth_token = f.read().strip() twitch_auth_token = f.read().strip()
managers = [ managers = []
StreamsManager(channel.rstrip('!'), base_dir, qualities, important=channel.endswith('!'), auth_token=auth_token) for channel, important, type, url in channels:
for channel in channels if type == "twitch":
] provider = TwitchProvider(channel, auth_token=twitch_auth_token)
else:
raise ValueError(f"Unknown type {type!r}")
manager = StreamsManager(provider, channel, base_dir, qualities, important=important)
managers.append(manager)
def stop(): def stop():
for manager in managers: for manager in managers:

@ -7,121 +7,138 @@ from common.requests import InstrumentedSession
from . import hls_playlist from . import hls_playlist
logger = logging.getLogger(__name__) class Provider:
"""Base class with defaults, to be overridden for specific providers"""
def get_access_token(channel, session, auth_token): # How long (in seconds) we should keep using a media playlist URI before getting a new one.
request = { # This matters because some providers set an expiry on the URI they give you.
"operationName": "PlaybackAccessToken", # However the default is an arbitrarily long period (ie. never).
"extensions": { MAX_WORKER_AGE = 30 * 24 * 60 * 60 # 30 days
"persistedQuery": {
"version": 1, def get_media_playlist_uris(self, qualities, session=None):
"sha256Hash": "0828119ded1c13477966434e15800ff57ddacf13ba1911c129dc2200705b0712" """Fetches master playlist and returns {quality: media playlist URI} for each
requested quality."""
raise NotImplementedError
def get_media_playlist(self, uri, session=None):
"""Fetches the given media playlist. In most cases this is just a simple fetch
and doesn't need to be overridden."""
if session is None:
session = InstrumentedSession()
resp = session.get(uri, metric_name='get_media_playlist')
resp.raise_for_status()
return hls_playlist.load(resp.text, base_uri=resp.url)
class TwitchProvider(Provider):
"""Provider that takes a twitch channel."""
# Twitch links expire after 24h, so roll workers at 20h
MAX_WORKER_AGE = 20 * 60 * 60
def __init__(self, channel, auth_token=None):
self.channel = channel
self.auth_token = auth_token
def get_access_token(self, session):
request = {
"operationName": "PlaybackAccessToken",
"extensions": {
"persistedQuery": {
"version": 1,
"sha256Hash": "0828119ded1c13477966434e15800ff57ddacf13ba1911c129dc2200705b0712"
}
},
"variables": {
"isLive": True,
"login": self.channel,
"isVod": False,
"vodID": "",
"playerType": "site"
} }
},
"variables": {
"isLive": True,
"login": channel,
"isVod": False,
"vodID": "",
"playerType": "site"
} }
} headers = {'Client-ID': 'kimne78kx3ncx6brgo4mv6wki5h1ko'}
headers = {'Client-ID': 'kimne78kx3ncx6brgo4mv6wki5h1ko'} if self.auth_token is not None:
if auth_token is not None: headers["Authorization"] = "OAuth {}".format(self.auth_token)
headers["Authorization"] = "OAuth {}".format(auth_token) resp = session.post(
resp = session.post( "https://gql.twitch.tv/gql",
"https://gql.twitch.tv/gql", json=request,
json=request, headers=headers,
headers=headers, metric_name='twitch_get_access_token',
metric_name='get_access_token', )
) resp.raise_for_status()
resp.raise_for_status() data = resp.json()["data"]["streamPlaybackAccessToken"]
data = resp.json()["data"]["streamPlaybackAccessToken"] return data['signature'], data['value']
return data['signature'], data['value']
def get_master_playlist(self, session):
sig, token = self.get_access_token(session)
def get_master_playlist(channel, session=None, auth_token=None): resp = session.get(
"""Get the master playlist for given channel from twitch""" "https://usher.ttvnw.net/api/channel/hls/{}.m3u8".format(self.channel),
if session is None: headers={
session = InstrumentedSession() "referer": "https://player.twitch.tv",
sig, token = get_access_token(channel, session, auth_token) "origin": "https://player.twitch.tv",
resp = session.get( },
"https://usher.ttvnw.net/api/channel/hls/{}.m3u8".format(channel), params={
headers={ # Taken from streamlink. Unsure what's needed and what changing things can do.
"referer": "https://player.twitch.tv", "player": "twitchweb",
"origin": "https://player.twitch.tv", "p": random.randrange(1000000),
}, "type": "any",
params={ "allow_source": "true",
# Taken from streamlink. Unsure what's needed and what changing things can do. "allow_audio_only": "true",
"player": "twitchweb", "allow_spectre": "false",
"p": random.randrange(1000000), "fast_bread": "true",
"type": "any", "sig": sig,
"allow_source": "true", "token": token,
"allow_audio_only": "true", },
"allow_spectre": "false", metric_name='twitch_get_master_playlist',
"fast_bread": "true", )
"sig": sig, resp.raise_for_status() # getting master playlist
"token": token, playlist = hls_playlist.load(resp.text, base_uri=resp.url)
}, return playlist
metric_name='get_master_playlist',
) def get_media_playlist_uris(self, target_qualities, session=None):
resp.raise_for_status() # getting master playlist # Twitch master playlists are observed to have the following form:
playlist = hls_playlist.load(resp.text, base_uri=resp.url) # The first listed variant is the source playlist and has "(source)" in the name.
return playlist # Other variants are listed in order of quality from highest to lowest, followed by audio_only.
# These transcoded variants are named "Hp[R]" where H is the vertical resolution and
# optionally R is the frame rate. R is elided if == 30. Examples: 720p60, 720p, 480p, 360p, 160p
def get_media_playlist_uris(master_playlist, target_qualities): # These variants are observed to only ever have one rendition, type video, which contains the name
"""From a master playlist, extract URIs of media playlists of interest. # but no URI. The URI in the main variant entry is the one to use. This is true even of the
Returns {stream name: uri}. # "audio_only" stream.
Note this is not a general method for all HLS streams, and makes twitch-specific assumptions, # Streams without transcoding options only show source and audio_only.
though we try to check and emit warnings if these assumptions are broken. # We return the source stream in addition to any in target_qualities that is found.
"""
# Twitch master playlists are observed to have the following form: logger = logging.getLogger("twitch")
# The first listed variant is the source playlist and has "(source)" in the name. if session is None:
# Other variants are listed in order of quality from highest to lowest, followed by audio_only. session = InstrumentedSession()
# These transcoded variants are named "Hp[R]" where H is the vertical resolution and
# optionally R is the frame rate. R is elided if == 30. Examples: 720p60, 720p, 480p, 360p, 160p master_playlist = self.get_master_playlist(session)
# These variants are observed to only ever have one rendition, type video, which contains the name
# but no URI. The URI in the main variant entry is the one to use. This is true even of the def variant_name(variant):
# "audio_only" stream. names = set(media.name for media in variant.media if media.type == "VIDEO" and media.name)
# Streams without transcoding options only show source and audio_only. if not names:
# We return the source stream in addition to any in target_qualities that is found. logger.warning("Variant {} has no named video renditions, can't determine name".format(variant))
return None
def variant_name(variant): if len(names) > 1:
names = set(media.name for media in variant.media if media.type == "VIDEO" and media.name) logger.warning("Variant {} has multiple possible names, picking one arbitrarily".format(variant))
if not names: return list(names)[0]
logger.warning("Variant {} has no named video renditions, can't determine name".format(variant))
return None if not master_playlist.playlists:
if len(names) > 1: raise ValueError("Master playlist has no variants")
logger.warning("Variant {} has multiple possible names, picking one arbitrarily".format(variant))
return list(names)[0] for variant in master_playlist.playlists:
if any(media.uri for media in variant.media):
if not master_playlist.playlists: logger.warning("Variant has a rendition with its own URI: {}".format(variant))
raise ValueError("Master playlist has no variants")
by_name = {variant_name(variant): variant for variant in master_playlist.playlists}
for variant in master_playlist.playlists:
if any(media.uri for media in variant.media): source_candidates = [name for name in by_name.keys() if "(source)" in name]
logger.warning("Variant has a rendition with its own URI: {}".format(variant)) if len(source_candidates) != 1:
raise ValueError("Can't find source stream, not exactly one candidate. Candidates: {}, playlist: {}".format(
by_name = {variant_name(variant): variant for variant in master_playlist.playlists} source_candidates, master_playlist,
))
source_candidates = [name for name in by_name.keys() if "(source)" in name] source = by_name[source_candidates[0]]
if len(source_candidates) != 1:
raise ValueError("Can't find source stream, not exactly one candidate. Candidates: {}, playlist: {}".format( variants = {name: variant for name, variant in by_name.items() if name in target_qualities}
source_candidates, master_playlist, variants["source"] = source
))
source = by_name[source_candidates[0]] return {name: variant.uri for name, variant in variants.items()}
variants = {name: variant for name, variant in by_name.items() if name in target_qualities}
variants["source"] = source
return {name: variant.uri for name, variant in variants.items()}
def get_media_playlist(uri, session=None):
if session is None:
session = InstrumentedSession()
resp = session.get(uri, metric_name='get_media_playlist')
resp.raise_for_status()
return hls_playlist.load(resp.text, base_uri=resp.url)

Loading…
Cancel
Save