From d35360f7684163d57d7735d4d946837ca2b0dac6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 21 Dec 2020 12:24:27 +0530 Subject: [PATCH] Update to bfa345744d055b2420f82336b339a51a9d73b28a [tastytrade] Remove Extractor https://github.com/ytdl-org/youtube-dl/commit/bfa345744d055b2420f82336b339a51a9d73b28a Except: Switch to GitHub actions for CI https://github.com/ytdl-org/youtube-dl/commit/a8b31505edbf4d9608b2d964b936b6720dcbb40d --- youtube_dlc/extractor/arkena.py | 152 +++++---- youtube_dlc/extractor/cnn.py | 5 +- youtube_dlc/extractor/common.py | 9 +- youtube_dlc/extractor/extractors.py | 12 +- youtube_dlc/extractor/generic.py | 55 ++++ youtube_dlc/extractor/nba.py | 480 ++++++++++++++++++++++------ youtube_dlc/extractor/niconico.py | 97 ++++-- youtube_dlc/extractor/nrk.py | 3 +- youtube_dlc/extractor/turner.py | 44 ++- youtube_dlc/extractor/youtube.py | 34 +- 10 files changed, 673 insertions(+), 218 deletions(-) diff --git a/youtube_dlc/extractor/arkena.py b/youtube_dlc/extractor/arkena.py index 854f58767..fd46b1c77 100644 --- a/youtube_dlc/extractor/arkena.py +++ b/youtube_dlc/extractor/arkena.py @@ -6,13 +6,11 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, - mimetype2ext, parse_iso8601, - strip_jsonp, + try_get, ) @@ -20,22 +18,27 @@ class ArkenaIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - video\.arkena\.com/play2/embed/player\?| + video\.(?:arkena|qbrick)\.com/play2/embed/player\?| play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P[^/]+)/[^/]+/(?P\d+) ) ''' _TESTS = [{ - 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', + 'md5': '97f117754e5f3c020f5f26da4a44ebaf', 'info_dict': { - 'id': 
'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'id': 'd8ab4607-00090107-aab86310', 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'timestamp': 1432816365, - 'upload_date': '20150528', - 'is_live': False, + 'title': 'EM_HT20_117_roslund_v2.mp4', + 'timestamp': 1608285912, + 'upload_date': '20201218', + 'duration': 1429.162667, + 'subtitles': { + 'sv': 'count:3', + }, }, + }, { + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'only_matching': True, }, { 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', 'only_matching': True, @@ -72,62 +75,89 @@ class ArkenaIE(InfoExtractor): if not video_id or not account_id: raise ExtractorError('Invalid URL', expected=True) - playlist = self._download_json( - 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' - % (video_id, account_id), - video_id, transform_source=strip_jsonp)['Playlist'][0] - - media_info = playlist['MediaInfo'] - title = media_info['Title'] - media_files = playlist['MediaFiles'] + media = self._download_json( + 'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id), + video_id, query={ + # https://video.qbrick.com/docs/api/examples/library-api.html + 'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags', + }) + metadata = media.get('metadata') or {} + title = metadata['title'] - is_live = False + duration = None formats = [] - for kind_case, kind_formats in media_files.items(): - kind = kind_case.lower() - for f in kind_formats: - f_url = f.get('Url') - if not f_url: - continue - is_live = f.get('Live') == 'true' - exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) - 
if kind == 'm3u8' or 'm3u8' in exts: - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=kind, fatal=False, live=is_live)) - elif kind == 'flash' or 'f4m' in exts: - formats.extend(self._extract_f4m_formats( - f_url, video_id, f4m_id=kind, fatal=False)) - elif kind == 'dash' or 'mpd' in exts: - formats.extend(self._extract_mpd_formats( - f_url, video_id, mpd_id=kind, fatal=False)) - elif kind == 'silverlight': - # TODO: process when ism is supported (see - # https://github.com/ytdl-org/youtube-dl/issues/8118) - continue - else: - tbr = float_or_none(f.get('Bitrate'), 1000) - formats.append({ - 'url': f_url, - 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, - 'tbr': tbr, - }) + thumbnails = [] + subtitles = {} + for resource in media['asset']['resources']: + for rendition in (resource.get('renditions') or []): + rendition_type = rendition.get('type') + for i, link in enumerate(rendition.get('links') or []): + href = link.get('href') + if not href: + continue + if rendition_type == 'image': + thumbnails.append({ + 'filesize': int_or_none(rendition.get('size')), + 'height': int_or_none(rendition.get('height')), + 'id': rendition.get('id'), + 'url': href, + 'width': int_or_none(rendition.get('width')), + }) + elif rendition_type == 'subtitle': + subtitles.setdefault(rendition.get('language') or 'en', []).append({ + 'url': href, + }) + elif rendition_type == 'video': + f = { + 'filesize': int_or_none(rendition.get('size')), + 'format_id': rendition.get('id'), + 'url': href, + } + video = try_get(rendition, lambda x: x['videos'][i], dict) + if video: + if not duration: + duration = float_or_none(video.get('duration')) + f.update({ + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate'), 1000), + 'vcodec': video.get('codec'), + 'width': int_or_none(video.get('width')), + }) + audio = try_get(video, lambda x: x['audios'][0], dict) + if audio: + f.update({ + 'acodec': audio.get('codec'), + 
'asr': int_or_none(audio.get('sampleRate')), + }) + formats.append(f) + elif rendition_type == 'index': + mime_type = link.get('mimeType') + if mime_type == 'application/smil+xml': + formats.extend(self._extract_smil_formats( + href, video_id, fatal=False)) + elif mime_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mime_type == 'application/hds+xml': + formats.extend(self._extract_f4m_formats( + href, video_id, f4m_id='hds', fatal=False)) + elif mime_type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + href, video_id, mpd_id='dash', fatal=False)) + elif mime_type == 'application/vnd.ms-sstr+xml': + formats.extend(self._extract_ism_formats( + href, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) - description = media_info.get('Description') - video_id = media_info.get('VideoId') or video_id - timestamp = parse_iso8601(media_info.get('PublishDate')) - thumbnails = [{ - 'url': thumbnail['Url'], - 'width': int_or_none(thumbnail.get('Size')), - } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] - return { 'id': video_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'is_live': is_live, + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(media.get('created')), 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'duration': duration, + 'tags': media.get('tags'), 'formats': formats, } diff --git a/youtube_dlc/extractor/cnn.py b/youtube_dlc/extractor/cnn.py index 774b71055..2d950fa05 100644 --- a/youtube_dlc/extractor/cnn.py +++ b/youtube_dlc/extractor/cnn.py @@ -96,7 +96,10 @@ class CNNIE(TurnerBaseIE): config['data_src'] % path, page_title, { 'default': { 'media_src': config['media_src'], - } + }, + 'f4m': { + 'host': 'cnn-vh.akamaihd.net', + }, }) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 
b230208ac..36499f0ac 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -2606,6 +2606,13 @@ class InfoExtractor(object): return entries def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): + signed = 'hdnea=' in manifest_url + if not signed: + # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html + manifest_url = re.sub( + r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', + '', manifest_url).strip('?') + formats = [] hdcore_sign = 'hdcore=3.7.0' @@ -2631,7 +2638,7 @@ class InfoExtractor(object): formats.extend(m3u8_formats) http_host = hosts.get('http') - if http_host and m3u8_formats and 'hdnea=' not in m3u8_url: + if http_host and m3u8_formats and not signed: REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 05ef4afbe..074b7e2f2 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -343,7 +343,6 @@ from .espn import ( ) from .esri import EsriVideoIE from .europa import EuropaIE -from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE @@ -522,7 +521,6 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE @@ -705,7 +703,14 @@ from .naver import ( NaverIE, NaverLiveIE, ) -from .nba import NBAIE +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) from .nbc import ( CSNNEIE, NBCIE, @@ -1163,7 +1168,6 @@ from 
.tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index 77a1bd7f2..f7b5c67bb 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -32,6 +32,8 @@ from ..utils import ( unified_timestamp, unsmuggle_url, UnsupportedError, + url_or_none, + xpath_attr, xpath_text, ) from .commonprotocols import RtmpIE @@ -214,6 +216,33 @@ class GenericIE(InfoExtractor): }, }], }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': 1567977776, + 'upload_date': '20190908', + 'duration': 459, + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + 'season_number': 1, + 'age_limit': 0, + }, + }], + 'params': { + 'skip_download': True, + }, + }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', @@ -2190,6 +2219,10 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + NS_MAP = { + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + } + entries = [] for it in doc.findall('./channel/item'): next_url = None @@ -2205,11 +2238,33 @@ class GenericIE(InfoExtractor): if not next_url: continue + def itunes(key): + return xpath_text( + it, xpath_with_ns('./itunes:%s' % key, NS_MAP), + default=None) + + duration = itunes('duration') + explicit = (itunes('explicit') or 
'').lower() + if explicit in ('true', 'yes'): + age_limit = 18 + elif explicit in ('false', 'no'): + age_limit = 0 + else: + age_limit = None + entries.append({ '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, 'description': xpath_text(it, 'description', default=None), + 'timestamp': unified_timestamp( + xpath_text(it, 'pubDate', default=None)), + 'duration': int_or_none(duration) or parse_duration(duration), + 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), + 'episode': itunes('title'), + 'episode_number': int_or_none(itunes('episode')), + 'season_number': int_or_none(itunes('season')), + 'age_limit': age_limit, }) return { diff --git a/youtube_dlc/extractor/nba.py b/youtube_dlc/extractor/nba.py index be295a7a3..fbc7adaf4 100644 --- a/youtube_dlc/extractor/nba.py +++ b/youtube_dlc/extractor/nba.py @@ -5,33 +5,137 @@ import re from .turner import TurnerBaseIE from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( + int_or_none, + merge_dicts, OnDemandPagedList, - remove_start, + parse_duration, + parse_iso8601, + try_get, + update_url_query, + urljoin, ) -class NBAIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' +class NBACVPBaseIE(TurnerBaseIE): + def _extract_nba_cvp_info(self, path, video_id, fatal=False): + return self._extract_cvp_info( + 'http://secure.nba.com/%s' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, + }, fatal=fatal) + + +class NBAWatchBaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/' + + def _extract_video(self, filter_key, filter_value): + video = self._download_json( + 
'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch', + filter_value, query={ + 'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName', + 'q': filter_key + ':' + filter_value, + 'wt': 'json', + })['response']['docs'][0] + + video_id = compat_str(video['pid']) + title = video['name'] + + formats = [] + m3u8_url = (self._download_json( + 'https://watch.nba.com/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + }, fatal=False) or {}).get('path') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + for f in m3u8_formats: + http_f = f.copy() + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': http_f['url'].replace('.m3u8', ''), + }) + formats.append(http_f) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')), + 'description': video.get('description'), + 'duration': int_or_none(video.get('runtime')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + 'tags': video.get('tags'), + } + + seo_name = video.get('seoName') + if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name): + base_path = '' + if seo_name.startswith('teams/'): + base_path += seo_name.split('/')[1] + '/' + base_path += 'video/' + cvp_info = self._extract_nba_cvp_info( + base_path + seo_name + '.xml', video_id, False) + if cvp_info: + formats.extend(cvp_info['formats']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + info['formats'] = formats + return info + + +class NBAWatchEmbedIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:embed' + _VALID_URL 
= NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P\d+)' + _TESTS = [{ + 'url': 'http://watch.nba.com/embed?id=659395', + 'md5': 'b7e3f9946595f4ca0a13903ce5edd120', + 'info_dict': { + 'id': '659395', + 'ext': 'mp4', + 'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'timestamp': 1492228800, + 'upload_date': '20170415', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video('pid', video_id) + + +class NBAWatchIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9e7729d3010a9c71506fd1248f74e4f4', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { - 'id': '0021200253-okc-bkn-recap', + 'id': '70946', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, - 'timestamp': 1354638466, + 'timestamp': 1354597200, 'upload_date': '20121204', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, @@ -39,116 +143,286 @@ class NBAIE(TurnerBaseIE): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'id': '330865', 'ext': 'mp4', 'title': 'Hawks vs. 
Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432134543, - 'upload_date': '20150520', + 'timestamp': 1432094400, + 'upload_date': '20150521', }, - 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', - 'info_dict': { - 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', - 'ext': 'mp4', - 'title': 'Practice: Doc Rivers - 2/16/16', - 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160216', - 'timestamp': 1455672000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', + 'only_matching': True, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'timberwolves', - 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', - }, - 'playlist_count': 30, - 'params': { - # Download the whole playlist takes too long time - 'playlist_items': '1-30', - }, + # only CVP mp4 format available + 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', + 'only_matching': True, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', + 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0] + if collection_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % display_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) + return self.url_result( + 'https://www.nba.com/watch/list/collection/' + collection_id, + NBAWatchCollectionIE.ie_key(), collection_id) + return self._extract_video('seoName', display_id) + + +class NBAWatchCollectionIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:collection' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://watch.nba.com/list/collection/season-preview-2020', 'info_dict': { - 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', - 'ext': 'mp4', - 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', - 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', - 'upload_date': '20141212', - 'timestamp': 1418418600, + 'id': 'season-preview-2020', }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'playlist_mincount': 43, }] + _PAGE_SIZE = 100 - _PAGE_SIZE = 30 + def _fetch_page(self, collection_id, page): + page += 1 + videos = self._download_json( + 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, + collection_id, 'Downloading page %d JSON metadata' % page, query={ + 'count': self._PAGE_SIZE, + 'page': page, + })['results']['videos'] + for video in videos: + program = video.get('program') or {} + seo_name = program.get('seoName') or program.get('slug') + if not seo_name: + continue + yield { + '_type': 'url', + 'id': program.get('id'), + 'title': program.get('title') or video.get('title'), + 'url': 'https://www.nba.com/watch/video/' + seo_name, + 'thumbnail': video.get('image'), + 'description': program.get('description') or video.get('description'), + 'duration': parse_duration(program.get('runtimeHours')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + } - def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' 
+ compat_urllib_parse_urlencode({ - 'type': 'teamvideo', - 'start': page * self._PAGE_SIZE + 1, - 'npp': (page + 1) * self._PAGE_SIZE + 1, - 'sort': 'recent', - 'output': 'json', - 'site': team, - }) - results = self._download_json( - search_url, video_id, note='Download page %d of playlist data' % page)['results'][0] - for item in results: - yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url'])) - - def _extract_playlist(self, orig_path, video_id, webpage): - team = orig_path.split('/')[0] - - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video because of --no-playlist') - video_path = self._search_regex( - r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path') - video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path) - return self.url_result(video_url) - - self.to_screen('Downloading playlist - add --no-playlist to just download video') - playlist_title = self._og_search_title(webpage, fatal=False) + def _real_extract(self, url): + collection_id = self._match_id(url) entries = OnDemandPagedList( - functools.partial(self._fetch_page, team, video_id), + functools.partial(self._fetch_page, collection_id), self._PAGE_SIZE) + return self.playlist_result(entries, collection_id) - return self.playlist_result(entries, team, playlist_title) - def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - orig_path = path - if path.startswith('nba/'): - path = path[3:] +class NBABaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'''(?x) + https?://(?:www\.)?nba\.com/ + (?P + blazers| + bucks| + bulls| + cavaliers| + celtics| + clippers| + grizzlies| + hawks| + heat| + hornets| + jazz| + kings| + knicks| + lakers| + magic| + mavericks| + nets| + nuggets| + pacers| + pelicans| + pistons| + raptors| + rockets| + sixers| + spurs| + suns| + thunder| + timberwolves| + warriors| + wizards + ) + (?:/play\#)?/''' + _CHANNEL_PATH_REGEX = r'video/channel|series' - if 'video/' 
not in path: - webpage = self._download_webpage(url, video_id) - path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/') + def _embed_url_result(self, team, content_id): + return self.url_result(update_url_query( + 'https://secure.nba.com/assets/amp/include/video/iframe.html', { + 'contentId': content_id, + 'team': team, + }), NBAEmbedIE.ie_key()) - if path == '{{id}}': - return self._extract_playlist(orig_path, video_id, webpage) + def _call_api(self, team, content_id, query, resource): + return self._download_json( + 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team, + content_id, 'Download %s JSON metadata' % resource, + query=query, headers={ + 'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b', + })['response']['result'] - # See prepareContentId() of pkgCvp.js - if path.startswith('video/teams'): - path = 'video/channels/proxy/' + path[6:] + def _extract_video(self, video, team, extract_all=True): + video_id = compat_str(video['nid']) + team = video['brand'] - return self._extract_cvp_info( - 'http://www.nba.com/%s.xml' % path, video_id, { - 'default': { - 'media_src': 'http://nba.cdn.turner.com/nba/big', - }, - 'm3u8': { - 'media_src': 'http://nbavod-f.akamaihd.net', - }, + info = { + 'id': video_id, + 'title': video.get('title') or video.get('headline') or video['shortHeadline'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('published')), + } + + subtitles = {} + captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({'url': caption_url}) + + formats = [] + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'url': mp4_url, }) + + if extract_all: + source_url = video.get('videoSource') + if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'): + formats.append({ + 'format_id': 'source', + 'url': 
source_url, + 'preference': 1, + }) + + m3u8_url = video.get('m3u8') + if m3u8_url: + if '.akamaihd.net/i/' in m3u8_url: + formats.extend(self._extract_akamai_formats( + m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'})) + else: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + content_xml = video.get('contentXml') + if team and content_xml: + cvp_info = self._extract_nba_cvp_info( + team + content_xml, video_id, fatal=False) + if cvp_info: + formats.extend(cvp_info['formats']) + subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + else: + info.update(self._embed_url_result(team, video['videoId'])) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + }) + + return info + + def _real_extract(self, url): + team, display_id = re.match(self._VALID_URL, url).groups() + if '/play#/' in url: + display_id = compat_urllib_parse_unquote(display_id) + else: + webpage = self._download_webpage(url, display_id) + display_id = self._search_regex( + self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id') + return self._extract_url_results(team, display_id) + + +class NBAEmbedIE(NBABaseIE): + IE_NAME = 'nba:embed' + _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P[^?#&]+)' + _TESTS = [{ + 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', + 'only_matching': True, + }, { + 'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP', + 'only_matching': True, + }] + + def 
_real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content_id = qs['contentId'][0] + team = qs.get('team', [None])[0] + if not team: + return self.url_result( + 'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key()) + video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0] + return self._extract_video(video, team) + + +class NBAIE(NBABaseIE): + IE_NAME = 'nba' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', + 'info_dict': { + 'id': '45039', + 'ext': 'mp4', + 'title': 'AND WE BACK.', + 'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.', + 'duration': 94, + 'timestamp': 1607112000, + 'upload_date': '20201218', + }, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860', + 'only_matching': True, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoID' + + def _extract_url_results(self, team, content_id): + return self._embed_url_result(team, content_id) + + +class NBAChannelIE(NBABaseIE): + IE_NAME = 'nba:channel' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/blazers/video/channel/summer_league', + 'info_dict': { + 'title': 'Summer League', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoSubCategory' + _PAGE_SIZE = 100 + + def _fetch_page(self, team, channel, page): + results = self._call_api(team, channel, { + 'channels': channel, + 'count': 
self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in results: + yield self._extract_video(video, team, False) + + def _extract_url_results(self, team, content_id): + entries = OnDemandPagedList( + functools.partial(self._fetch_page, team, content_id), + self._PAGE_SIZE) + return self.playlist_result(entries, playlist_title=content_id) diff --git a/youtube_dlc/extractor/niconico.py b/youtube_dlc/extractor/niconico.py index eb07ca776..a85fc3d5c 100644 --- a/youtube_dlc/extractor/niconico.py +++ b/youtube_dlc/extractor/niconico.py @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import json import datetime +import functools +import json +import math from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urlparse, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, dict_get, ExtractorError, - int_or_none, float_or_none, + InAdvancePagedList, + int_or_none, parse_duration, parse_iso8601, remove_start, @@ -181,7 +184,7 @@ class NiconicoIE(InfoExtractor): if urlh is False: login_ok = False else: - parts = compat_urlparse.urlparse(urlh.geturl()) + parts = compat_urllib_parse_urlparse(urlh.geturl()) if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': login_ok = False if not login_ok: @@ -292,7 +295,7 @@ class NiconicoIE(InfoExtractor): 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') - flv_info = compat_urlparse.parse_qs(flv_info_webpage) + flv_info = compat_parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', @@ -437,34 +440,76 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 
'http://www.nicovideo.jp/mylist/27411728', 'info_dict': { 'id': '27411728', 'title': 'AKB48のオールナイトニッポン', + 'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08', + 'uploader': 'のっく', + 'uploader_id': '805442', }, 'playlist_mincount': 225, - } + }, { + 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', + 'only_matching': True, + }] + _PAGE_SIZE = 100 + + def _call_api(self, list_id, resource, query): + return self._download_json( + 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers={'X-Frontend-Id': 6})['data']['mylist'] + + def _parse_owner(self, item): + owner = item.get('owner') or {} + if owner: + return { + 'uploader': owner.get('name'), + 'uploader_id': owner.get('id'), + } + return {} + + def _fetch_page(self, list_id, page): + page += 1 + items = self._call_api(list_id, 'page %d' % page, { + 'page': page, + 'pageSize': self._PAGE_SIZE, + })['items'] + for item in items: + video = item.get('video') or {} + video_id = video.get('id') + if not video_id: + continue + count = video.get('count') or {} + get_count = lambda x: int_or_none(count.get(x)) + info = { + '_type': 'url', + 'id': video_id, + 'title': video.get('title'), + 'url': 'https://www.nicovideo.jp/watch/' + video_id, + 'description': video.get('shortDescription'), + 'duration': int_or_none(video.get('duration')), + 'view_count': get_count('view'), + 'comment_count': get_count('comment'), + 'ie_key': NiconicoIE.ie_key(), + } + info.update(self._parse_owner(video)) + yield info def _real_extract(self, url): list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);', - webpage, 'entries') - entries = json.loads(entries_json) - entries = [{ - '_type': 'url', - 'ie_key': NiconicoIE.ie_key(), - 'url': ('http://www.nicovideo.jp/watch/%s' % - entry['item_data']['video_id']), - } for entry in entries] - - return { - '_type': 
'playlist', - 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'), - 'id': list_id, - 'entries': entries, - } + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + entries = InAdvancePagedList( + functools.partial(self._fetch_page, list_id), + math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + result = self.playlist_result( + entries, list_id, mylist.get('name'), mylist.get('description')) + result.update(self._parse_owner(mylist)) + return result diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index fdf2d7407..b545f291b 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -33,8 +33,7 @@ class NRKBaseIE(InfoExtractor): def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): - return self._extract_akamai_formats( - re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) + return self._extract_akamai_formats(asset_url, video_id) asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) formats = self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py index 2964504a2..81229a54b 100644 --- a/youtube_dlc/extractor/turner.py +++ b/youtube_dlc/extractor/turner.py @@ -6,6 +6,7 @@ import re from .adobepass import AdobePassIE from ..compat import compat_str from ..utils import ( + fix_xml_ampersands, xpath_text, int_or_none, determine_ext, @@ -49,8 +50,13 @@ class TurnerBaseIE(AdobePassIE): self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token return video_url + '?hdnea=' + token - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): - video_data = self._download_xml(data_src, video_id) + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + video_data = self._download_xml( + data_src, video_id, + transform_source=lambda s: 
fix_xml_ampersands(s).strip(), + fatal=fatal) + if not video_data: + return {} video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) content_id = xpath_text(video_data, 'contentId') or video_id @@ -63,12 +69,14 @@ class TurnerBaseIE(AdobePassIE): urls = [] formats = [] + thumbnails = [] + subtitles = {} rex = re.compile( r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') # Possible formats locations: files/file, files/groupFiles/files # and maybe others for video_file in video_data.findall('.//file'): - video_url = video_file.text.strip() + video_url = url_or_none(video_file.text.strip()) if not video_url: continue ext = determine_ext(video_url) @@ -108,9 +116,28 @@ class TurnerBaseIE(AdobePassIE): continue urls.append(video_url) format_id = video_file.get('bitrate') - if ext == 'smil': + if ext in ('scc', 'srt', 'vtt'): + subtitles.setdefault('en', []).append({ + 'ext': ext, + 'url': video_url, + }) + elif ext == 'png': + thumbnails.append({ + 'id': format_id, + 'url': video_url, + }) + elif ext == 'smil': formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) + elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): + formats.extend(self._extract_akamai_formats( + video_url, video_id, { + 'hds': path_data.get('f4m', {}).get('host'), + # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com + # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com + # ssl.cdn.turner.com + 'http': 'pmd.cdn.turner.com', + })) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -129,7 +156,7 @@ class TurnerBaseIE(AdobePassIE): 'url': video_url, 'ext': ext, } - mobj = rex.search(format_id + video_url) + mobj = rex.search(video_url) if mobj: f.update({ 'width': int(mobj.group('width')), @@ -152,7 +179,6 @@ class TurnerBaseIE(AdobePassIE): formats.append(f) self._sort_formats(formats) - subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in 
source.findall('track'): track_url = url_or_none(track.get('url')) @@ -168,12 +194,12 @@ class TurnerBaseIE(AdobePassIE): }.get(source.get('format')) }) - thumbnails = [{ - 'id': image.get('cut'), + thumbnails.extend({ + 'id': image.get('cut') or image.get('name'), 'url': image.text, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data.findall('images/image')] + } for image in video_data.findall('images/image')) is_live = xpath_text(video_data, 'isLive') == 'true' diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index f03637c0a..b75d42410 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -324,6 +324,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), video_id) + def _extract_ytcfg(self, video_id, webpage): + return self._parse_json( + self._search_regex( + r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', + default='{}'), video_id, fatal=False) + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -2381,16 +2387,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # annotations video_annotations = None if self._downloader.params.get('writeannotations', False): - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P[A-Za-z0-9+/=]+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) + xsrf_token = None + ytcfg = self._extract_ytcfg(video_id, video_webpage) + if ytcfg: + xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) + if not xsrf_token: + xsrf_token = self._search_regex( + r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P(?:(?!\2).)+)\2', + video_webpage, 'xsrf token', group='xsrf_token', fatal=False) invideo_url = try_get( player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) if xsrf_token and invideo_url: - xsrf_field_name = self._search_regex( - 
r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') + xsrf_field_name = None + if ytcfg: + xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) + if not xsrf_field_name: + xsrf_field_name = self._search_regex( + r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P\w+)\2', + video_webpage, 'xsrf field name', + group='xsrf_field_name', default='session_token') video_annotations = self._download_webpage( self._proto_relative_url(invideo_url), video_id, note='Downloading annotations', @@ -3276,10 +3291,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield alert_type, message def _extract_identity_token(self, webpage, item_id): - ytcfg = self._parse_json( - self._search_regex( - r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), item_id, fatal=False) + ytcfg = self._extract_ytcfg(item_id, webpage) if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: