diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b38b01c28..45ee65728 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,6 +112,7 @@ - **blinkx** - **Bloomberg** - **BokeCC** + - **BongaCams** - **BostonGlobe** - **Box** - **Bpb**: Bundeszentrale für politische Bildung @@ -146,6 +147,7 @@ - **CBS** - **CBSInteractive** - **CBSLocal** + - **CBSLocalArticle** - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos @@ -198,6 +200,7 @@ - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 + - **CTV** - **CTVNews** - **cu.ntv.co.jp**: Nippon Television Network - **Culturebox** @@ -1119,6 +1122,7 @@ - **WeiboMobile** - **WeiqiTV**: WQTV - **Wistia** + - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** - **WSJ**: Wall Street Journal diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 8dcdc4e58..130038c0d 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -39,7 +39,7 @@ class TestAllURLsMatching(unittest.TestCase): assertTab('https://www.youtube.com/embedded') assertTab('https://www.youtube.com/feed') # Own channel's home page assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') - assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') + assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) @@ -60,8 +60,8 @@ class TestAllURLsMatching(unittest.TestCase): assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - # def test_youtube_user_matching(self): - # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) + def test_youtube_user_matching(self): + self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py index 54cbcdc8e..b4daee54e 100644 --- a/youtube_dlc/extractor/bbc.py +++ b/youtube_dlc/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r']+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. - # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ diff --git a/youtube_dlc/extractor/bongacams.py b/youtube_dlc/extractor/bongacams.py new file mode 100644 index 000000000..180542fbc --- /dev/null +++ b/youtube_dlc/extractor/bongacams.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.com)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + channel_id = mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index b5d94d60a..6e68f0960 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -127,6 +127,7 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE from .bpb import BpbIE @@ -1492,7 +1493,10 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE -from .wistia import WistiaIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, +) from .worldstarhiphop import WorldStarHipHopIE from .wsj import ( WSJIE, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index f7b5c67bb..81c2ae650 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -2022,22 +2022,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': [SpringboardPlatformIE.ie_key()], }, - { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': [YoutubeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', 'info_dict': { diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py index c3eba0114..1eeddc3b6 100644 --- a/youtube_dlc/extractor/instagram.py +++ b/youtube_dlc/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor): 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', - 'uploader': 'Naomi Leonor Phan-Quang', + 'uploader': 'B E A U T Y F O R A S H E S', 'like_count': int, 'comment_count': int, 'comments': list, @@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/tv/aye83DjauH/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, }] @staticmethod @@ -122,81 +125,92 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - (video_url, description, thumbnail, timestamp, uploader, + (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, - width) = [None] * 11 - - shared_data = try_get(webpage, - (lambda x: self._parse_json( - self._search_regex( - r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);', - x, 'additional data', default='{}'), - video_id, fatal=False), - lambda x: self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - x, 'shared data', default='{}'), - video_id, fatal=False)['entry_data']['PostPage'][0]), - None) + width) = [None] * 12 + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + webpage, 'shared data', default='{}'), + video_id, fatal=False) if shared_data: media = try_get( shared_data, - (lambda x: x['graphql']['shortcode_media'], - lambda x: x['media']), + (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], + lambda x: x['entry_data']['PostPage'][0]['media']), dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - thumbnail = media.get('display_src') or media.get('thumbnail_src') - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(key, kind): - return int_or_none(try_get( + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/ytdl-org/youtube-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + thumbnail = media.get('display_src') or media.get('display_url') + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') + + def get_count(keys, kind): + if not isinstance(keys, (list, tuple)): + keys = [keys] + for key in keys: + count = int_or_none(try_get( media, (lambda x: x['edge_media_%s' % key]['count'], lambda x: x['%ss' % kind]['count']))) - like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') - - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + if count is not None: + return count + like_count = get_count('preview_like', 'like') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') + + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) diff --git a/youtube_dlc/extractor/pornhub.py b/youtube_dlc/extractor/pornhub.py index 9ad92a8ec..2fcbd186f 100644 --- a/youtube_dlc/extractor/pornhub.py +++ b/youtube_dlc/extractor/pornhub.py @@ -288,14 +288,24 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) video_urls_set.add(v_url) + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) + if not video_urls: - FORMAT_PREFIXES = ('media', 'quality') + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None) if js_vars: for key, format_url in js_vars.items(): - if any(key.startswith(p) for p in FORMAT_PREFIXES): + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): add_video_url(format_url) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): @@ -351,12 +361,16 @@ class PornHubIE(PornHubBaseIE): r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', default=None) + def extract_vote_count(kind, name): + return self._extract_count( + (r']+\bclass="votes%s"[^>]*>([\d,\.]+)' % kind, + r']+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), + webpage, name) + view_count = self._extract_count( r'([\d,\.]+) [Vv]iews', webpage, 'view') - like_count = self._extract_count( - r']+class="votesUp"[^>]*>([\d,\.]+)', webpage, 'like') - dislike_count = self._extract_count( - r']+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike') + like_count = extract_vote_count('Up', 'like') + dislike_count = extract_vote_count('Down', 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') diff --git a/youtube_dlc/extractor/spankbang.py b/youtube_dlc/extractor/spankbang.py index 61ca902ce..37cb8c839 100644 --- a/youtube_dlc/extractor/spankbang.py +++ b/youtube_dlc/extractor/spankbang.py @@ -7,17 +7,24 @@ from ..utils import ( determine_ext, ExtractorError, merge_dicts, - orderedSet, parse_duration, parse_resolution, str_to_int, url_or_none, urlencode_postdata, + urljoin, ) class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/(?:video|play|embed)\b' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P[\da-z]+)/playlist/[^/?#&]+ + ) + ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor): }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) @@ -155,30 +166,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/[^/]+' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r']+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'

([^<]+)\s+playlist

', webpage, 'playlist title', + r'

([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dlc/extractor/sprout.py b/youtube_dlc/extractor/sprout.py index 8467bf49d..e243732f2 100644 --- a/youtube_dlc/extractor/sprout.py +++ b/youtube_dlc/extractor/sprout.py @@ -3,50 +3,62 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - extract_attributes, - update_url_query, + int_or_none, smuggle_url, + update_url_query, ) class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'md5': '74bf14128578d1e040c3ebc82088f45f', + _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', 'info_dict': { - 'id': '9dexnwtmh8_X', + 'id': 'bm0foJFaTKqb', 'ext': 'mp4', - 'title': 'A Cowboy Adventure', - 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', - 'timestamp': 1437758640, - 'upload_date': '20150724', - 'uploader': 'NBCU-SPROUT-NEW', - } - } + 'title': 'Robot Bike Race', + 'description': 'md5:436b1d97117cc437f54c383f4debc66d', + 'timestamp': 1606148940, + 'upload_date': '20201123', + 'uploader': 'NBCU-MPAT', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'only_matching': True, + }, { + 'url': 'https://www.universalkids.com/watch/robot-bike-race', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['US'] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_component = self._search_regex( - r'(?s)(]+data-component="video"[^>]*?>)', - webpage, 'video component', default=None) - if video_component: - options = self._parse_json(extract_attributes( - video_component)['data-options'], video_id) - theplatform_url = options['video'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if options.get('protected'): - query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') - theplatform_url = smuggle_url(update_url_query( - theplatform_url, query), {'force_smil_url': True}) - else: - iframe = self._search_regex( - r'(]+id="sproutVideoIframe"[^>]*?>)', - webpage, 'iframe') - theplatform_url = extract_attributes(iframe)['src'] - - return self.url_result(theplatform_url, 'ThePlatform') + display_id = self._match_id(url) + mpx_metadata = self._download_json( + # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ + 'https://www.universalkids.com/_api/videos/' + display_id, + display_id)['mpxMetadata'] + media_pid = mpx_metadata['mediaPid'] + theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if mpx_metadata.get('entitlement') == 'auth': + query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') + theplatform_url = smuggle_url( + update_url_query(theplatform_url, query), { + 'force_smil_url': True, + 'geo_countries': self._GEO_COUNTRIES, + }) + return { + '_type': 'url_transparent', + 'id': media_pid, + 'url': theplatform_url, + 'series': mpx_metadata.get('seriesName'), + 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), + 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), + 'ie_key': 'ThePlatform', + } diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py index 41bfbe80f..adfe11e31 100644 --- a/youtube_dlc/extractor/theplatform.py +++ b/youtube_dlc/extractor/theplatform.py @@ -234,6 +234,9 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) provider_id = mobj.group('provider_id') diff --git a/youtube_dlc/extractor/theweatherchannel.py b/youtube_dlc/extractor/theweatherchannel.py index c34a49d03..b2a8c3797 100644 --- a/youtube_dlc/extractor/theweatherchannel.py +++ b/youtube_dlc/extractor/theweatherchannel.py @@ -1,18 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .theplatform import ThePlatformIE from ..utils import ( determine_ext, parse_duration, + parse_iso8601, ) class TheWeatherChannelIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?weather\.com(?P(?:/(?P[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P[^/?#]+))' _TESTS = [{ 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', 'info_dict': { 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', 'ext': 'mp4', @@ -20,18 +24,33 @@ class TheWeatherChannelIE(ThePlatformIE): 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', 'uploader': 'TWC - Digital (No Distro)', 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + 'upload_date': '20160720', + 'timestamp': 1469018835, } + }, { + 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = drupal_settings['twc']['contexts']['node']['uuid'] - video_data = self._download_json( - 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + asset_name, locale, display_id = re.match(self._VALID_URL, url).groups() + if not locale: + locale = 'en-US' + video_data = list(self._download_json( + 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ + 'name': 'getCMSAssetsUrlConfig', + 'params': { + 'language': locale.replace('-', '_'), + 'query': { + 'assetName': { + '$in': asset_name, + }, + }, + } + }]).encode(), headers={ + 'Content-Type': 'application/json', + })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] + video_id = video_data['id'] seo_meta = video_data.get('seometa', {}) title = video_data.get('title') or seo_meta['title'] @@ -66,6 +85,8 @@ class TheWeatherChannelIE(ThePlatformIE): }) self._sort_formats(formats) + cc_url = video_data.get('cc_url') + return { 'id': video_id, 'display_id': display_id, @@ -74,6 +95,8 @@ class TheWeatherChannelIE(ThePlatformIE): 'duration': parse_duration(video_data.get('duration')), 'uploader': video_data.get('providername'), 'uploader_id': video_data.get('providerid'), + 'timestamp': parse_iso8601(video_data.get('publishdate')), + 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, 'thumbnails': thumbnails, 'formats': formats, } diff --git a/youtube_dlc/extractor/wistia.py b/youtube_dlc/extractor/wistia.py index 77febd2eb..ae32a0a68 100644 --- a/youtube_dlc/extractor/wistia.py +++ b/youtube_dlc/extractor/wistia.py @@ -5,79 +5,34 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, + try_get, unescapeHTML, ) -class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]{10})' +class WistiaBaseIE(InfoExtractor): + _VALID_ID_REGEX = r'(?P[a-z0-9]{10})' + _VALID_URL_BASE = r'https?://(?:fast\.)?wistia\.(?:net|com)/embed/' _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' - _TESTS = [{ - 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', - 'md5': 'cafeb56ec0c53c18c97405eecb3133df', - 'info_dict': { - 'id': 'sh7fpupwlt', - 'ext': 'mov', - 'title': 'Being Resourceful', - 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', - 'upload_date': '20131204', - 'timestamp': 1386185018, - 'duration': 117, - }, - }, { - 'url': 'wistia:sh7fpupwlt', - 'only_matching': True, - }, { - # with hls video - 'url': 'wistia:807fafadvk', - 'only_matching': True, - }, { - 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', - 'only_matching': True, - }, { - 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', - 'only_matching': True, - }] - - # https://wistia.com/support/embed-and-share/video-on-your-website - @staticmethod - def _extract_url(webpage): - urls = WistiaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): - urls = [] - for match in re.finditer( - r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): - urls.append(unescapeHTML(match.group('url'))) - for match in re.finditer( - r'''(?sx) - ]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P[a-z0-9]{10})\b(?:(?!\1).)*?\1 - ''', webpage): - urls.append('wistia:%s' % match.group('id')) - for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage): - urls.append('wistia:%s' % match.group('id')) - return urls - - def _real_extract(self, url): - video_id = self._match_id(url) - - data_json = self._download_json( - self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id, - # Some videos require this. - headers={ - 'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id, + def _download_embed_config(self, config_type, config_id, referer): + base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id) + embed_config = self._download_json( + base_url + '.json', config_id, headers={ + 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. }) - if data_json.get('error'): + if isinstance(embed_config, dict) and embed_config.get('error'): raise ExtractorError( 'Error while getting the playlist', expected=True) - data = data_json['media'] + return embed_config + + def _extract_media(self, embed_config): + data = embed_config['media'] + video_id = data['hashedId'] title = data['name'] formats = [] @@ -160,3 +115,85 @@ class WistiaIE(InfoExtractor): 'timestamp': int_or_none(data.get('createdAt')), 'subtitles': subtitles, } + + +class WistiaIE(WistiaBaseIE): + _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) + + _TESTS = [{ + # with hls video + 'url': 'wistia:807fafadvk', + 'md5': 'daff0f3687a41d9a71b40e0e8c2610fe', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video', + 'upload_date': '20160518', + 'timestamp': 1463607249, + 'duration': 4987.11, + }, + }, { + 'url': 'wistia:sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', + 'only_matching': True, + }] + + # https://wistia.com/support/embed-and-share/video-on-your-website + @staticmethod + def _extract_url(webpage): + urls = WistiaIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + urls = [] + for match in re.finditer( + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): + urls.append(unescapeHTML(match.group('url'))) + for match in re.finditer( + r'''(?sx) + ]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P[a-z0-9]{10})\b(?:(?!\1).)*?\1 + ''', webpage): + urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage): + urls.append('wistia:%s' % match.group('id')) + return urls + + def _real_extract(self, url): + video_id = self._match_id(url) + embed_config = self._download_embed_config('media', video_id, url) + return self._extract_media(embed_config) + + +class WistiaPlaylistIE(WistiaBaseIE): + _VALID_URL = r'%splaylists/%s' % (WistiaIE._VALID_URL_BASE, WistiaIE._VALID_ID_REGEX) + + _TEST = { + 'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc', + 'info_dict': { + 'id': 'aodt9etokc', + }, + 'playlist_count': 3, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist = self._download_embed_config('playlist', playlist_id, url) + + entries = [] + for media in (try_get(playlist, lambda x: x[0]['medias']) or []): + embed_config = media.get('embed_config') + if not embed_config: + continue + entries.append(self._extract_media(embed_config)) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 0a1da437b..ad248a356 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -64,9 +64,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' _RESERVED_NAMES = ( - r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|' - r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' - r'feed/(watch_later|history|subscriptions|library|trending|recommended)') + r'embed|e|channel|c|user|playlist|watch|w|v|results|shared|' + r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|' + r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)') _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided @@ -2544,7 +2544,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): feed/| (?:playlist|watch)\?.*?\blist= )| - (?!(%s)([/#?]|$)) # Direct URLs + (?!(?:%s)\b) # Direct URLs ) (?P[^/?\#&]+) ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES @@ -2813,13 +2813,22 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # inline playlist with not always working continuations 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', 'only_matching': True, - } - # TODO - # { - # 'url': 'https://www.youtube.com/TheYoungTurks/live', - # 'only_matching': True, - # } - ] + }, { + 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/zsecurity', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }] def _extract_channel_id(self, webpage): channel_id = self._html_search_meta(