diff --git a/README.md b/README.md index 3e0db98b8..ecadd622f 100644 --- a/README.md +++ b/README.md @@ -734,6 +734,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + - `language`: Language code Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 651ad0428..eddd1f1b3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -103,6 +103,7 @@ - **BilibiliAudioAlbum** - **BiliBiliPlayer** - **BioBioChileTV** + - **Biography** - **BIQLE** - **BitChute** - **BitChuteChannel** @@ -197,7 +198,6 @@ - **CrooksAndLiars** - **crunchyroll** - **crunchyroll:playlist** - - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTV** @@ -354,6 +354,7 @@ - **hgtv.com:show** - **HiDive** - **HistoricFilms** + - **history:player** - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** @@ -1099,6 +1100,7 @@ - **vube**: Vube.com - **VuClip** - **VVVVID** + - **VVVVIDShow** - **VyboryMos** - **Vzaar** - **Wakanim** diff --git a/test/test_utils.py b/test/test_utils.py index 16ad40831..f519a4489 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -554,6 +554,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 32766ea61..11d8031ff 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -1138,7 +1138,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?Pext|acodec|vcodec|container|protocol|format_id|language) \s*(?P!\s*)?(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ diff --git a/youtube_dlc/extractor/aenetworks.py b/youtube_dlc/extractor/aenetworks.py index 3d0cf1208..8e4963131 100644 --- a/youtube_dlc/extractor/aenetworks.py +++ b/youtube_dlc/extractor/aenetworks.py @@ -6,6 +6,7 @@ import re from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, update_url_query, urlencode_postdata, @@ -28,6 +29,7 @@ class AENetworksBaseIE(ThePlatformIE): 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), 'fyi.tv': ('FYI', 'fyi'), 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), } def _extract_aen_smil(self, smil_url, video_id, auth=None): @@ -54,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -67,6 +71,34 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' @@ -139,32 +171,7 @@ class AENetworksIE(AENetworksBaseIE): def _real_extract(self, url): domain, canonical = re.match(self._VALID_URL, url).groups() - requestor_id, brand = self._DOMAIN_MAP[domain] - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - canonical, query={'filter[canonical]': '/' + canonical})['results'][0] - title = result['title'] - video_id = result['id'] - media_url = result['publicUrl'] - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - info.update({ - 'title': title, - 'series': result.get('seriesName'), - 'season_number': int_or_none(result.get('tvSeasonNumber')), - 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), - }) - return info + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) class AENetworksListBaseIE(AENetworksBaseIE): @@ -294,3 +301,42 @@ class HistoryTopicIE(AENetworksBaseIE): return self.url_result( 'http://www.history.com/videos/' + display_id, AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = self._search_regex( + r']+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 1a2e7c62a..6022076ac 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -471,18 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - sources_num = len(json_data.get('sources')) - key_systems_present = 0 - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html - if source.get('key_systems'): - key_systems_present += 1 + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 continue - elif ext == 'ism' or container == 'WVM': + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -540,14 +540,14 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) - if sources_num == key_systems_present: - raise ExtractorError('This video is DRM protected', expected=True) - - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 135951e57..3b762541f 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -33,6 +33,8 @@ from .aenetworks import ( AENetworksCollectionIE, AENetworksShowIE, HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, ) from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE @@ -716,7 +718,6 @@ from .nba import ( NBAChannelIE, ) from .nbc import ( - CSNNEIE, NBCIE, NBCNewsIE, NBCOlympicsIE, @@ -1461,7 +1462,10 @@ from .vshare import VShareIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE -from .vvvvid import VVVVIDIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .wakanim import WakanimIE diff --git a/youtube_dlc/extractor/nbc.py b/youtube_dlc/extractor/nbc.py index ea5f5a315..0d77648c2 100644 --- a/youtube_dlc/extractor/nbc.py +++ b/youtube_dlc/extractor/nbc.py @@ -158,7 +158,8 @@ class NBCIE(AdobePassIE): class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor): }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): iframe_m = re.search( - r']+src="(?Phttps?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) if iframe_m: return iframe_m.group('url') @@ -192,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https because its certificate is invalid - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P[0-9a-z-]+)' - _TEST = { + _TESTS = [{ + # iframe src 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', 'info_dict': { 'id': 'PHJSaFWbrTY9', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, } - } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -274,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE): } -class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' - - _TEST = { - 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', - 'info_dict': { - 'id': 'yvBLLUgQ8WU0', - 'ext': 'mp4', - 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', - 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', - 'timestamp': 1459369979, - 'upload_date': '20160330', - 'uploader': 'NBCU-SPORTS', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': self._html_search_meta('twitter:player:stream', webpage), - 'display_id': display_id, - } - - class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' diff --git a/youtube_dlc/extractor/reddit.py b/youtube_dlc/extractor/reddit.py index 5a9a3b24b..77f66c966 100644 --- a/youtube_dlc/extractor/reddit.py +++ b/youtube_dlc/extractor/reddit.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, float_or_none, try_get, + unescapeHTML, url_or_none, ) @@ -56,7 +57,8 @@ class RedditRIE(InfoExtractor): 'id': 'zv89llsvexdz', 'ext': 'mp4', 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', @@ -118,11 +120,34 @@ class RedditRIE(InfoExtractor): else: age_limit = None + thumbnails = [] + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) + return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), - 'thumbnail': url_or_none(data.get('thumbnail')), + 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), 'duration': int_or_none(try_get( diff --git a/youtube_dlc/extractor/sevenplus.py b/youtube_dlc/extractor/sevenplus.py index 84568ac69..240afc18f 100644 --- a/youtube_dlc/extractor/sevenplus.py +++ b/youtube_dlc/extractor/sevenplus.py @@ -4,8 +4,12 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, try_get, update_url_query, ) @@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE): def _real_extract(self, url): path, episode_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - 'https://videoservice.swm.digital/playback', episode_id, query={ - 'appId': '7plus', - 'deviceType': 'web', - 'platformType': 'web', - 'accountId': 5303576322001, - 'referenceId': 'ref:' + episode_id, - 'deliveryId': 'csai', - 'videoType': 'vod', - })['media'] + try: + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + raise for source in media.get('sources', {}): src = source.get('src') diff --git a/youtube_dlc/extractor/tenplay.py b/youtube_dlc/extractor/tenplay.py index af325fea8..cd30d57f4 100644 --- a/youtube_dlc/extractor/tenplay.py +++ b/youtube_dlc/extractor/tenplay.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + HEADRequest, parse_age_limit, parse_iso8601, - smuggle_url, + # smuggle_url, ) @@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor): 'uploader_id': '2199827728001', }, 'params': { - 'format': 'bestvideo', + # 'format': 'bestvideo', 'skip_download': True, } }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + _GEO_BYPASS = False + _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect' def _real_extract(self, url): content_id = self._match_id(url) @@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor): video = data.get('video') or {} metadata = data.get('metaData') or {} brightcove_id = video.get('videoId') or metadata['showContentVideoId'] - brightcove_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['AU']}) + # brightcove_url = smuggle_url( + # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + # {'geo_countries': ['AU']}) + m3u8_url = self._request_webpage(HEADRequest( + self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl() + if '10play-not-in-oz' in m3u8_url: + self.raise_geo_restricted(countries=['AU']) + formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4') + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': brightcove_url, - 'id': content_id, - 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + # '_type': 'url_transparent', + # 'url': brightcove_url, + 'formats': formats, + 'id': brightcove_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'], 'description': video.get('description'), 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), 'series': metadata.get('showName'), 'season': metadata.get('showContentSeason'), 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), - 'ie_key': 'BrightcoveNew', + 'thumbnail': video.get('poster'), + 'uploader_id': '2199827728001', + # 'ie_key': 'BrightcoveNew', } diff --git a/youtube_dlc/extractor/uktvplay.py b/youtube_dlc/extractor/uktvplay.py index 2137502a1..f28fd514d 100644 --- a/youtube_dlc/extractor/uktvplay.py +++ b/youtube_dlc/extractor/uktvplay.py @@ -5,10 +5,9 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' - _TEST = { + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' + _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', - 'md5': '', 'info_dict': { 'id': '2117008346001', 'ext': 'mp4', @@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Failed to download MPD manifest'] - } + }, { + 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', + 'only_matching': True, + }] + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dlc/extractor/vvvvid.py b/youtube_dlc/extractor/vvvvid.py index 6906cd2ab..014a67e53 100644 --- a/youtube_dlc/extractor/vvvvid.py +++ b/youtube_dlc/extractor/vvvvid.py @@ -12,7 +12,8 @@ from ..utils import ( class VVVVIDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' % _VALID_URL_BASE _TESTS = [{ # video_type == 'video/vvvvid' 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', @@ -21,6 +22,16 @@ class VVVVIDIE(InfoExtractor): 'id': '489048', 'ext': 'mp4', 'title': 'Ping Pong', + 'duration': 239, + 'series': '"Perché dovrei guardarlo?" di Dario Moccia', + 'season_id': '437', + 'season_number': 1, + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -37,6 +48,9 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True }] _conn_id = None @@ -45,20 +59,36 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, - }) - if response['result'] == 'error': + }, fatal=fatal) + if not (response or fatal): + return + if response.get('result') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_number': int_or_none(video_data.get('number')), + 'episode_id': str_or_none(video_data.get('id')), + } + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, 'season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response['data']))[0] + lambda episode: episode.get('video_id') == vid, response))[0] + title = video_data['title'] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -141,18 +171,67 @@ class VVVVIDIE(InfoExtractor): 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) self._sort_formats(formats) - return { + info = self._extract_common_video_info(video_data) + info.update({ 'id': video_id, - 'title': video_data['title'], + 'title': title, 'formats': formats, - 'thumbnail': video_data.get('thumbnail'), 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, 'season_number': video_data.get('season_number'), - 'episode_id': str_or_none(video_data.get('id')), - 'episode_number': int_or_none(video_data.get('number')), - 'episode_title': video_data['title'], + 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), - } + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info + + +class VVVVIDShowIE(VVVVIDIE): + _VALID_URL = r'(?P%s(?P\d+)(?:/(?P[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.vvvvid.it/show/156/psyco-pass', + 'info_dict': { + 'id': '156', + 'title': 'Psycho-Pass', + 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', + }, + 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, + }] + + def _real_extract(self, url): + base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() + + seasons = self._download_info( + show_id, 'seasons/', show_title) + + show_info = self._download_info( + show_id, 'info/', show_title, fatal=False) + + entries = [] + for season in (seasons or []): + season_number = int_or_none(season.get('number')) + episodes = season.get('episodes') or [] + for episode in episodes: + season_id = str_or_none(episode.get('season_id')) + video_id = str_or_none(episode.get('video_id')) + if not (season_id and video_id): + continue + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url', + 'ie_key': VVVVIDIE.ie_key(), + 'url': '/'.join([base_url, season_id, video_id]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'season_number': season_number, + 'season_id': season_id, + }) + entries.append(info) + + return self.playlist_result( + entries, show_id, show_info.get('title'), show_info.get('description')) diff --git a/youtube_dlc/extractor/yandexdisk.py b/youtube_dlc/extractor/yandexdisk.py index e8f6ae10f..6fcd8ee7e 100644 --- a/youtube_dlc/extractor/yandexdisk.py +++ b/youtube_dlc/extractor/yandexdisk.py @@ -1,23 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + mimetype2ext, try_get, - urlencode_postdata, + urljoin, ) class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?:// + (?P + yadi\.sk| + disk\.yandex\. + (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', 'info_dict': { 'id': 'VdOeDou8eZs6Y', 'ext': 'mp4', @@ -27,92 +47,101 @@ class YandexDiskIE(InfoExtractor): 'uploader_id': '300043621', 'view_count': int, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) - - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') + domain, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s*]+id="store-prefetch"[^>]*>\s*({.+?})\s*', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] + + title = resource['name'] + meta = resource.get('meta') or {} + + public_url = meta.get('short_url') + if public_url: + video_id = self._match_id(public_url) + + source_url = (self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources/download', + video_id, query={'public_key': url}, fatal=False) or {}).get('href') + video_streams = resource.get('videoStreams') or {} + video_hash = resource.get('hash') or url + environment = store.get('environment') or {} + sk = environment.get('sk') + yandexuid = environment.get('yandexuid') + if sk and yandexuid and not (source_url and video_streams): + self._set_cookie(domain, 'yandexuid', yandexuid) + + def call_api(action): + return (self._download_json( + urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ + 'hash': video_hash, + 'sk': sk, + }).encode(), headers={ + 'Content-Type': 'text/plain', + }, fatal=False) or {}).get('data') or {} + if not source_url: + # TODO: figure out how to detect if download limit has + # been reached and then avoid unnecessary source format + # extraction requests + source_url = call_api('download-url').get('url') + if not video_streams: + video_streams = call_api('get-video-streams') formats = [] if source_url: formats.append({ 'url': source_url, 'format_id': 'source', - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), 'quality': 1, + 'filesize': int_or_none(meta.get('size')) }) - for video in videos: + + for video in (video_streams.get('videos') or []): format_url = video.get('url') if not format_url: continue - if determine_ext(format_url) == 'm3u8': + if video.get('dimension') == 'adaptive': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: + size = video.get('size') or {} + height = int_or_none(size.get('height')) + format_id = 'hls' + if height: + format_id += '-%dp' % height formats.append({ + 'ext': 'mp4', + 'format_id': format_id, + 'height': height, + 'protocol': 'm3u8_native', 'url': format_url, + 'width': int_or_none(size.get('width')), }) self._sort_formats(formats) - duration = float_or_none(try_get( - models, lambda x: x[0]['data']['duration']), 1000) - uploader = try_get( - data, lambda x: x['user']['display_name'], compat_str) - uploader_id = try_get( - data, lambda x: x['user']['uid'], compat_str) - view_count = int_or_none(try_get( - data, lambda x: x['meta']['views_counter'])) + uid = resource.get('uid') + display_name = try_get(store, lambda x: x['users'][uid]['displayName']) return { 'id': video_id, 'title': title, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, + 'duration': float_or_none(video_streams.get('duration'), 1000), + 'uploader': display_name, + 'uploader_id': uid, + 'view_count': int_or_none(meta.get('views_counter')), 'formats': formats, } diff --git a/youtube_dlc/extractor/yandexvideo.py b/youtube_dlc/extractor/yandexvideo.py index 46529be05..ab8c84c93 100644 --- a/youtube_dlc/extractor/yandexvideo.py +++ b/youtube_dlc/extractor/yandexvideo.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + try_get, url_or_none, ) @@ -13,26 +14,30 @@ class YandexVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=| + yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| frontend\.vh\.yandex\.ru/player/ ) - (?P[\da-f]+) + (?P(?:[\da-f]{32}|[\w-]{12})) ''' _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { - 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', - 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 0, - 'duration': 30, + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, }, }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, }, { 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', @@ -52,53 +57,88 @@ class YandexVideoIE(InfoExtractor): # DASH with DRM 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - })['content'] - - content_url = url_or_none(content.get('content_url')) or url_or_none( - content['streams'][0]['url']) - title = content.get('title') or content.get('computed_title') + player = try_get((self._download_json( + 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = player['content'] - ext = determine_ext(content_url) + title = content.get('title') or content['computed_title'] - if ext == 'm3u8': - formats = self._extract_m3u8_formats( - content_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - elif ext == 'mpd': - formats = self._extract_mpd_formats( - content_url, video_id, mpd_id='dash') - else: - formats = [{'url': content_url}] + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) self._sort_formats(formats) - description = content.get('description') - thumbnail = content.get('thumbnail') timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) - duration = int_or_none(content.get('duration')) - series = content.get('program_title') - age_limit = int_or_none(content.get('restriction_age')) + season = content.get('season') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'age_limit': age_limit, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), 'formats': formats, } diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 68b4ca944..d713c6709 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -3647,7 +3647,7 @@ def url_or_none(url): if not url or not isinstance(url, compat_str): return None url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def parse_duration(s):