[mediasite] Improve extraction and code style, add support for DASH (closes #11185, closes #14343, refs #5428)

pull/2/head
Sergey M․ 7 years ago
parent 8056c8542d
commit 2ca7ed41fe
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

@@ -100,6 +100,7 @@ from .megaphone import MegaphoneIE
from .vzaar import VzaarIE from .vzaar import VzaarIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .vshare import VShareIE from .vshare import VShareIE
from .mediasite import MediasiteIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@@ -1925,6 +1926,18 @@ class GenericIE(InfoExtractor):
'title': 'vl14062007715967', 'title': 'vl14062007715967',
'ext': 'mp4', 'ext': 'mp4',
} }
},
{
'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
'md5': 'aecd089f55b1cb5a59032cb049d3a356',
'info_dict': {
'id': '90227f51a80c4d8f86c345a7fa62bd9a1d',
'ext': 'mp4',
'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare',
'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
'timestamp': 1474354800,
'upload_date': '20160920',
}
} }
# { # {
# # TODO: find another test # # TODO: find another test
@@ -2884,14 +2897,14 @@ class GenericIE(InfoExtractor):
vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
# Look for Mediasite embeds # Look for Mediasite embeds
mobj = re.search(r'''(?xi) mediasite_urls = MediasiteIE._extract_urls(webpage)
<iframe[^>]+src="((?:https?://[a-z0-9\-\.:\[\]]+)? if mediasite_urls:
/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)" entries = [
''', webpage) self.url_result(smuggle_url(
if mobj is not None: compat_urlparse.urljoin(url, mediasite_url),
return self.url_result(smuggle_url( {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
compat_urlparse.urljoin(url, unescapeHTML(mobj.group(1))), for mediasite_url in mediasite_urls]
{ 'UrlReferrer': url }), 'Livestream') return self.playlist_result(entries, video_id, video_title)
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}

@@ -5,21 +5,22 @@ import re
import json import json
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
unsmuggle_url,
mimetype2ext,
float_or_none, float_or_none,
mimetype2ext,
unescapeHTML,
unsmuggle_url,
urljoin,
) )
class MediasiteIE(InfoExtractor): class MediasiteIE(InfoExtractor):
_VALID_URL = r'''(?xi) _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)'
https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/
(?P<id>[0-9a-f]{32,34})
(?P<QueryString>\?[^#]+|)
'''
_TESTS = [ _TESTS = [
{ {
'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
@@ -94,59 +95,88 @@ class MediasiteIE(InfoExtractor):
5: 'video3', 5: 'video3',
} }
# Collect the `src` URLs of all Mediasite player <iframe> embeds in `webpage`.
#
# webpage: HTML text to scan.
# Returns a list with one HTML-unescaped URL string per matching iframe, in
# the order re.finditer yields them; the list is empty when nothing matches.
# The host part of the pattern is optional, so a returned URL may be
# absolute, scheme-relative ("//host/...") or host-relative ("/Mediasite/...");
# no joining against a base URL is done here.
@staticmethod
def _extract_urls(webpage):
return [
unescapeHTML(mobj.group('url'))
for mobj in re.finditer(
# Pattern notes:
# - (["\']) ... \1: the src value must be closed by the same quote
#   character that opened it.
# - the path must be /Mediasite/Play/ followed by 32-34 hex characters,
#   optionally followed by a "?..." query string.
# - the `i` flag makes the match case-insensitive; the `x` flag is set
#   too, but the pattern contains no whitespace or `#` for it to affect.
r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
url, data = unsmuggle_url(url, {}) url, data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
ResourceId = mobj.group('id') resource_id = mobj.group('id')
QueryString = mobj.group('QueryString') query = mobj.group('query')
webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer?
redirect_url = compat_str(urlh.geturl())
# XXX: might have also extracted UrlReferrer and QueryString from the html # XXX: might have also extracted UrlReferrer and QueryString from the html
ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
r'<div id="ServicePath">(.+?)</div>', webpage, ResourceId, r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
default='/Mediasite/PlayerService/PlayerService.svc/json')) default='/Mediasite/PlayerService/PlayerService.svc/json'))
PlayerOptions = self._download_json( player_options = self._download_json(
'%s/GetPlayerOptions' % (ServicePath), ResourceId, '%s/GetPlayerOptions' % service_path, resource_id,
headers={ headers={
'Content-type': 'application/json; charset=utf-8', 'Content-type': 'application/json; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
}, },
data=json.dumps({ data=json.dumps({
'getPlayerOptionsRequest': { 'getPlayerOptionsRequest': {
'ResourceId': ResourceId, 'ResourceId': resource_id,
'QueryString': QueryString, 'QueryString': query,
'UrlReferrer': data.get('UrlReferrer', ''), 'UrlReferrer': data.get('UrlReferrer', ''),
'UseScreenReader': False, 'UseScreenReader': False,
} }
}).encode('utf-8')) }).encode('utf-8'))['d']
Presentation = PlayerOptions['d']['Presentation']
if Presentation is None: presentation = player_options['Presentation']
raise ExtractorError('Mediasite says: %s' % title = presentation['Title']
(PlayerOptions['d']['PlayerPresentationStatusMessage'],),
if presentation is None:
raise ExtractorError(
'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'],
expected=True) expected=True)
thumbnails = [] thumbnails = []
formats = [] formats = []
for snum, Stream in enumerate(Presentation['Streams']): for snum, Stream in enumerate(presentation['Streams']):
stream_type = self._STREAM_TYPES.get( stream_type = Stream.get('StreamType')
Stream['StreamType'], 'type%u' % Stream['StreamType']) if stream_type is None:
continue
video_urls = Stream.get('VideoUrls')
if not isinstance(video_urls, list):
video_urls = []
stream_id = self._STREAM_TYPES.get(
stream_type, 'type%u' % stream_type)
stream_formats = [] stream_formats = []
for unum, VideoUrl in enumerate(Stream['VideoUrls']): for unum, VideoUrl in enumerate(video_urls):
url = VideoUrl['Location'] video_url = VideoUrl.get('Location')
if not video_url or not isinstance(video_url, compat_str):
continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
if VideoUrl['MediaType'] == 'SS': media_type = VideoUrl.get('MediaType')
if media_type == 'SS':
stream_formats.extend(self._extract_ism_formats( stream_formats.extend(self._extract_ism_formats(
url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) video_url, resource_id,
continue ism_id='%s-%u.%u' % (stream_id, snum, unum),
fatal=False))
elif media_type == 'Dash':
stream_formats.extend(self._extract_mpd_formats(
video_url, resource_id,
mpd_id='%s-%u.%u' % (stream_id, snum, unum),
fatal=False))
else:
stream_formats.append({ stream_formats.append({
'format_id': '%s-%u.%u' % (stream_type, snum, unum), 'format_id': '%s-%u.%u' % (stream_id, snum, unum),
'url': url, 'url': video_url,
'ext': mimetype2ext(VideoUrl['MimeType']), 'ext': mimetype2ext(VideoUrl.get('MimeType')),
}) })
# TODO: if Stream['HasSlideContent']: # TODO: if Stream['HasSlideContent']:
@@ -155,16 +185,16 @@ class MediasiteIE(InfoExtractor):
# this will require writing a custom downloader... # this will require writing a custom downloader...
# disprefer 'secondary' streams # disprefer 'secondary' streams
if Stream['StreamType'] != 0: if stream_type != 0:
for fmt in stream_formats: for fmt in stream_formats:
fmt['preference'] = -1 fmt['preference'] = -1
ThumbnailUrl = Stream.get('ThumbnailUrl') thumbnail_url = Stream.get('ThumbnailUrl')
if ThumbnailUrl: if thumbnail_url:
thumbnails.append({ thumbnails.append({
'id': '%s-%u' % (stream_type, snum), 'id': '%s-%u' % (stream_id, snum),
'url': compat_urlparse.urljoin(url, ThumbnailUrl), 'url': urljoin(redirect_url, thumbnail_url),
'preference': -1 if Stream['StreamType'] != 0 else 0, 'preference': -1 if stream_type != 0 else 0,
}) })
formats.extend(stream_formats) formats.extend(stream_formats)
@@ -174,11 +204,11 @@ class MediasiteIE(InfoExtractor):
# XXX: Presentation['Transcript'] # XXX: Presentation['Transcript']
return { return {
'id': ResourceId, 'id': resource_id,
'title': Presentation['Title'], 'title': title,
'description': Presentation.get('Description'), 'description': presentation.get('Description'),
'duration': float_or_none(Presentation.get('Duration'), 1000), 'duration': float_or_none(presentation.get('Duration'), 1000),
'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), 'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
'formats': formats, 'formats': formats,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
} }

Loading…
Cancel
Save