diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0b183b272..083c0ee6d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -313,6 +313,7 @@ - **FrontendMasters** - **FrontendMastersCourse** - **FrontendMastersLesson** + - **FujiTVFODPlus7** - **Funimation** - **Funk** - **Fusion** @@ -493,6 +494,7 @@ - **META** - **metacafe** - **Metacritic** + - **mewatch** - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** @@ -719,6 +721,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV** + - **Qub** - **Quickline** - **QuicklineLive** - **R7** @@ -923,6 +926,7 @@ - **ThisAV** - **ThisOldHouse** - **TikTok** + - **TikTokUser** (Currently broken) - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -961,6 +965,7 @@ - **TVANouvellesArticle** - **TVC** - **TVCArticle** + - **TVer** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** @@ -1142,6 +1147,8 @@ - **yahoo:japannews**: Yahoo! Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы + - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YandexVideo** diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index ef6fe0a78..a5c8e1df6 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -1679,7 +1679,7 @@ class YoutubeDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1889,7 +1889,7 @@ class YoutubeDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info, subtitle) subtitles_are_requested = any([self.params.get('writesubtitles', False), diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index aacdf06fe..a5df94e9c 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -2615,33 +2615,32 @@ class InfoExtractor(object): hls_host = hosts.get('hls') if hls_host: m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) http_host = hosts.get('http') - if http_host and 'hdnea=' not in manifest_url: - REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + if http_host and m3u8_formats and 'hdnea=' not in m3u8_url: + REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) - if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + if len(m3u8_formats) in (qualities_length, qualities_length + 1): i = 0 - http_formats = [] - for f in formats: - if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for f in m3u8_formats: + if f['vcodec'] != 'none': for protocol in ('http', 'https'): http_f = f.copy() del http_f['manifest_url'] http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) http_f.update({ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), 'url': http_url, 'protocol': protocol, }) - http_formats.append(http_f) + formats.append(http_f) i += 1 - formats.extend(http_formats) return formats diff --git a/youtube_dlc/extractor/cspan.py b/youtube_dlc/extractor/cspan.py index 67d6df4b0..766942146 100644 --- a/youtube_dlc/extractor/cspan.py +++ b/youtube_dlc/extractor/cspan.py @@ -10,6 +10,8 @@ from ..utils import ( find_xpath_attr, get_element_by_class, int_or_none, + js_to_json, + merge_dicts, smuggle_url, unescapeHTML, ) @@ -98,6 +100,26 @@ class CSpanIE(InfoExtractor): bc_attr['data-bcid']) return self.url_result(smuggle_url(bc_url, {'source_url': url})) + def add_referer(formats): + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url + + # As of 01.12.2020 this path looks to cover all cases making the rest + # of the code unnecessary + jwsetup = self._parse_json( + self._search_regex( + r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if jwsetup: + info = self._parse_jwplayer_data( + jwsetup, video_id, require_title=False, m3u8_id='hls', + base_url=url) + add_referer(info['formats']) + ld_info = self._search_json_ld(webpage, video_id, default={}) + return merge_dicts(info, ld_info) + + # Obsolete # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) @@ -165,6 +187,7 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + add_referer(formats) self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), diff --git a/youtube_dlc/extractor/drtv.py b/youtube_dlc/extractor/drtv.py index 390e79f8c..c0036adb6 100644 --- a/youtube_dlc/extractor/drtv.py +++ b/youtube_dlc/extractor/drtv.py @@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor): https?:// (?: (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P[\da-z_-]+) ''' @@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 5ad9d2717..0b2b234b6 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -409,6 +409,7 @@ from .frontendmasters import ( FrontendMastersLessonIE, FrontendMastersCourseIE ) +from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .fusion import FusionIE @@ -1220,7 +1221,10 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .toggle import ToggleIE +from .toggle import ( + ToggleIE, + MeWatchIE, +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE @@ -1253,7 +1257,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE -from .tva import TVAIE +from .tva import ( + TVAIE, + QubIE, +) from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, @@ -1262,6 +1269,7 @@ from .tvc import ( TVCIE, TVCArticleIE, ) +from .tver import TVerIE from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE @@ -1515,6 +1523,8 @@ from .yandexmusic import ( YandexMusicTrackIE, YandexMusicAlbumIE, YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, ) from .yandexvideo import YandexVideoIE from .yapfiles import YapFilesIE diff --git a/youtube_dlc/extractor/fujitv.py b/youtube_dlc/extractor/fujitv.py new file mode 100644 index 000000000..39685e075 --- /dev/null +++ b/youtube_dlc/extractor/fujitv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FujiTVFODPlus7IE(InfoExtractor): + _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P[0-9a-z]+)' + _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _BITRATE_MAP = { + 300: (320, 180), + 800: (640, 360), + 1200: (1280, 720), + 2000: (1280, 720), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + for f in formats: + wh = self._BITRATE_MAP.get(f.get('tbr')) + if wh: + f.update({ + 'width': wh[0], + 'height': wh[1], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + } diff --git a/youtube_dlc/extractor/gamespot.py b/youtube_dlc/extractor/gamespot.py index 4236a5ed8..7a1beae3c 100644 --- a/youtube_dlc/extractor/gamespot.py +++ b/youtube_dlc/extractor/gamespot.py @@ -1,16 +1,7 @@ from __future__ import unicode_literals -import re - from .once import OnceIE -from ..compat import ( - compat_urllib_parse_unquote, -) -from ..utils import ( - unescapeHTML, - url_basename, - dict_get, -) +from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -24,17 +15,16 @@ class GameSpotIE(OnceIE): 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', }, + 'skip': 'manifest URL give HTTP Error 404: Not Found', }, { 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'md5': '173ea87ad762cf5d3bf6163dceb255a6', 'info_dict': { 'id': 'gs-2300-6424837', 'ext': 'mp4', 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, }, { 'url': 'https://www.gamespot.com/videos/embed/6439218/', 'only_matching': True, @@ -49,90 +39,40 @@ class GameSpotIE(OnceIE): def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex( - r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = self._parse_json(unescapeHTML(data_video_json), page_id) + data_video = self._parse_json(self._html_search_regex( + r'data-video=(["\'])({.*?})\1', webpage, + 'video data', group=2), page_id) + title = compat_urllib_parse_unquote(data_video['title']) streams = data_video['videoStreams'] - - manifest_url = None formats = [] - f4m_url = streams.get('f4m_stream') - if f4m_url: - manifest_url = f4m_url - formats.extend(self._extract_f4m_formats( - f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) + + m3u8_url = streams.get('adaptive_stream') if m3u8_url: - manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) - if progressive_url and manifest_url: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if qualities_basename: - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) - http_url_basename = url_basename(progressive_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for f in m3u8_formats: + formats.append(f) + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': f['url'].replace('.m3u8', '.mp4'), + }) + formats.append(http_f) - onceux_json = self._search_regex( - r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) - if onceux_json: - onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') - if onceux_url: - formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), - http_formats_preference=-1)) + mpd_url = streams.get('adaptive_dash') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, page_id, mpd_id='dash', fatal=False)) - if not formats: - for quality in ['sd', 'hd']: - # It's actually a link to a flv file - flv_url = streams.get('f4m_{0}'.format(quality)) - if flv_url is not None: - formats.append({ - 'url': flv_url, - 'ext': 'flv', - 'format_id': quality, - }) self._sort_formats(formats) return { - 'id': data_video['guid'], + 'id': data_video.get('guid') or page_id, 'display_id': page_id, - 'title': compat_urllib_parse_unquote(data_video['title']), + 'title': title, 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index e5d29f316..965c25be6 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -201,11 +201,19 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'description': 're:.*her unique approach to storytelling.*', + 'upload_date': '20201204', + }, + }], }, # RSS feed with enclosures and unsupported link URLs { @@ -2202,6 +2210,7 @@ class GenericIE(InfoExtractor): '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, + 'description': xpath_text(it, 'description', default=None), }) return { diff --git a/youtube_dlc/extractor/mediaset.py b/youtube_dlc/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/youtube_dlc/extractor/mediaset.py +++ b/youtube_dlc/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dlc/extractor/nhk.py b/youtube_dlc/extractor/nhk.py index de6a707c4..6a61a47d2 100644 --- a/youtube_dlc/extractor/nhk.py +++ b/youtube_dlc/extractor/nhk.py @@ -10,7 +10,7 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -21,6 +21,19 @@ class NhkVodIE(InfoExtractor): 'timestamp': 1565965194, 'upload_date': '20190816', }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 4a395546f..fdf2d7407 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import random import re from .common import InfoExtractor @@ -12,17 +14,57 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - js_to_json, - NO_DEFAULT, parse_age_limit, parse_duration, try_get, + urljoin, url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' + + def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats( + re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats + + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query) class NRKIE(NRKBaseIE): @@ -41,7 +83,7 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', 'info_dict': { 'id': '150533', 'ext': 'mp4', @@ -55,7 +97,7 @@ class NRKIE(NRKBaseIE): # MD5 is unstable 'info_dict': { 'id': '154915', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, @@ -78,9 +120,15 @@ class NRKIE(NRKBaseIE): }] def _extract_from_playback(self, video_id): - manifest = self._download_json( - 'http://psapi.nrk.no/playback/manifest/%s' % video_id, - video_id, 'Downloading manifest JSON') + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) playable = manifest['playable'] @@ -94,14 +142,10 @@ class NRKIE(NRKBaseIE): if not format_url: continue if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) - data = self._download_json( - 'http://psapi.nrk.no/playback/metadata/%s' % video_id, - video_id, 'Downloading metadata JSON') + data = call_playback_api('metadata') preplay = data['preplay'] titles = preplay['titles'] @@ -143,29 +187,22 @@ class NRKIE(NRKBaseIE): class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { 'id': 'MDDP12000117AA', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, + 'duration': 2223.44, 'age_limit': 6, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', @@ -175,7 +212,6 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -183,7 +219,7 @@ class NRKTVIE(NRKBaseIE): 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, + 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': '24.05.2014', }, @@ -194,51 +230,25 @@ class NRKTVIE(NRKBaseIE): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], 'info_dict': { - 'id': 'MSPO40010515', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -269,12 +279,16 @@ class NRKTVIE(NRKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, }] _api_host = None @@ -295,6 +309,7 @@ class NRKTVIE(NRKBaseIE): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + urls = [] entries = [] conviva = data.get('convivaStatistics') or {} @@ -311,19 +326,14 @@ class NRKTVIE(NRKBaseIE): else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) for num, asset in enumerate(media_assets, 1): asset_url = asset.get('url') - if not asset_url: + if not asset_url or asset_url in urls: continue - formats = self._extract_akamai_formats(asset_url, video_id) + urls.append(asset_url) + formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - entry_id, entry_title = video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -339,38 +349,26 @@ class NRKTVIE(NRKBaseIE): 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'is_live': live, }) if not entries: media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] + if media_url and media_url not in urls: + formats = self._extract_nrk_formats(media_url, video_id) + if formats: + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + 'is_live': live, + }] if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) + self._raise_error(data) series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') @@ -512,57 +510,98 @@ class NRKTVEpisodeIE(InfoExtractor): return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, seasons): - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - entries.extend(self._extract_episodes(season)) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue + if not re.match(NRKTVIE._EPISODE_RE, nrk_id): + continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') or data + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: + break + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }] @classmethod def suitable(cls, url): @@ -570,25 +609,35 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) + domain, serie, season_id = re.match(self._VALID_URL, url).groups() + display_id = '%s/%s' % (serie, season_id) - series = self._extract_series(webpage, display_id) + data = self._call_api( + '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + display_id, 'season', query={'pageSize': 50}) - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) - - title = try_get(season, lambda x: x['titles']['title'], compat_str) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' _TESTS = [{ + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { 'url': 'https://tv.nrk.no/serie/blank', 'info_dict': { 'id': 'blank', @@ -602,25 +651,16 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'info_dict': { 'id': 'backstage', 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - }, { - # new layout, instalments - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 10, }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -632,6 +672,17 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, }] @classmethod @@ -642,43 +693,42 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) - - webpage = self._download_webpage(url, series_id) - - # New layout (e.g. https://tv.nrk.no/serie/backstage) - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) - - # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] + site, series_id = re.match(self._VALID_URL, url).groups() + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' + + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' + series = self._call_api( + '%s/catalog/series/%s' % (domain, series_id), + series_id, 'serie', query={size_prefix + 'ageSize': 50}) + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -782,14 +832,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r']+type=["\']application/json["\'][^>]*>({.+?})', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) diff --git a/youtube_dlc/extractor/peertube.py b/youtube_dlc/extractor/peertube.py index 48fb95416..c39d12728 100644 --- a/youtube_dlc/extractor/peertube.py +++ b/youtube_dlc/extractor/peertube.py @@ -541,6 +541,10 @@ class PeerTubeIE(InfoExtractor): 'format_id': format_id, 'filesize': file_size, }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) diff --git a/youtube_dlc/extractor/pornhub.py b/youtube_dlc/extractor/pornhub.py index 529f3f711..9ad92a8ec 100644 --- a/youtube_dlc/extractor/pornhub.py +++ b/youtube_dlc/extractor/pornhub.py @@ -31,7 +31,12 @@ class PornHubBaseIE(InfoExtractor): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r']+\bonload=["\']go\(\)', @@ -53,7 +58,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -152,6 +157,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', + 'only_matching': True, }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, @@ -160,7 +168,7 @@ class PornHubIE(PornHubBaseIE): @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -346,9 +354,9 @@ class PornHubIE(PornHubBaseIE): view_count = self._extract_count( r'([\d,\.]+) [Vv]iews', webpage, 'view') like_count = self._extract_count( - r'([\d,\.]+)', webpage, 'like') + r']+class="votesUp"[^>]*>([\d,\.]+)', webpage, 'like') dislike_count = self._extract_count( - r'([\d,\.]+)', webpage, 'dislike') + r']+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') @@ -422,7 +430,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -490,7 +498,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -605,7 +613,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dlc/extractor/teachable.py b/youtube_dlc/extractor/teachable.py index a75369dbe..6f264bddc 100644 --- a/youtube_dlc/extractor/teachable.py +++ b/youtube_dlc/extractor/teachable.py @@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE): r'(?s)(?P
  • ]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?
  • )', webpage): li = mobj.group('li') - if 'fa-youtube-play' not in li: + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): continue lecture_url = self._search_regex( r']+href=(["\'])(?P(?:(?!\1).)+)\1', li, diff --git a/youtube_dlc/extractor/toggle.py b/youtube_dlc/extractor/toggle.py index ca2e36efe..91b8023b8 100644 --- a/youtube_dlc/extractor/toggle.py +++ b/youtube_dlc/extractor/toggle.py @@ -11,13 +11,13 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, ) class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P[0-9]+)' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -84,28 +84,12 @@ class ToggleIE(InfoExtractor): 'only_matching': True, }] - _FORMAT_PREFERENCES = { - 'wvm-STBMain': -10, - 'wvm-iPadMain': -20, - 'wvm-iPhoneMain': -30, - 'wvm-Android': -40, - } _API_USER = 'tvpapi_147' _API_PASS = '11111' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading video page') - - api_user = self._search_regex( - r'apiUser\s*:\s*(["\'])(?P.+?)\1', webpage, 'apiUser', - default=self._API_USER, group='user') - api_pass = self._search_regex( - r'apiPass\s*:\s*(["\'])(?P.+?)\1', webpage, 'apiPass', - default=self._API_PASS, group='pass') - params = { 'initObj': { 'Locale': { @@ -118,17 +102,16 @@ class ToggleIE(InfoExtractor): 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, - 'ApiPass': api_pass + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS }, 'MediaID': video_id, 'mediaType': 0, } - req = sanitized_Request( + info = self._download_json( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - json.dumps(params).encode('utf-8')) - info = self._download_json(req, video_id, 'Downloading video info json') + video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) title = info['MediaName'] @@ -141,11 +124,16 @@ class ToggleIE(InfoExtractor): vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False)) + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id=vid_format, @@ -158,28 +146,21 @@ class ToggleIE(InfoExtractor): note='Downloading %s ISM manifest' % vid_format, errnote='Failed to download %s ISM manifest' % vid_format, fatal=False)) - elif ext in ('mp4', 'wvm'): - # wvm are drm-protected files + elif ext == 'mp4': formats.append({ 'ext': ext, 'url': video_url, 'format_id': vid_format, - 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, - 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) if not formats: + for meta in (info.get('Metas') or []): + if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': + raise ExtractorError( + 'This video is DRM protected.', expected=True) # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) - duration = int_or_none(info.get('Duration')) - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - - average_rating = float_or_none(info.get('Rating')) - view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) - like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) - thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -199,15 +180,46 @@ class ToggleIE(InfoExtractor): }) thumbnails.append(thumbnail) + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': created_at, - 'average_rating': average_rating, - 'view_count': view_count, - 'like_count': like_count, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), 'thumbnails': thumbnails, 'formats': formats, } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + def _real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) diff --git a/youtube_dlc/extractor/tva.py b/youtube_dlc/extractor/tva.py index 443f46e8a..52a4ddf32 100644 --- a/youtube_dlc/extractor/tva.py +++ b/youtube_dlc/extractor/tva.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, smuggle_url, + strip_or_none, ) @@ -23,7 +25,8 @@ class TVAIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://video.tva.ca/details/_5596811470001', 'only_matching': True, @@ -32,26 +35,54 @@ class TVAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ - 'Accept': 'application/json', - }, query={ - 'appId': '5955fc5f23eec60006c951f1', - }) - - def get_attribute(key): - for attribute in video_data.get('attributes', []): - if attribute.get('key') == key: - return attribute.get('value') - return None return { '_type': 'url_transparent', 'id': video_id, - 'title': get_attribute('title'), 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'description': get_attribute('description'), - 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), - 'duration': float_or_none(get_attribute('video-duration'), 1000), 'ie_key': 'BrightcoveNew', } + + +class QubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'md5': '949490fd0e7aee11d0543777611fbd53', + 'info_dict': { + 'id': '6084352463001', + 'ext': 'mp4', + 'title': 'Épisode 01', + 'uploader_id': '5481942443001', + 'upload_date': '20190907', + 'timestamp': 1567899756, + 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + }, + }, { + 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', + 'only_matching': True, + }] + # reference_id also works with old account_id(5481942443001) + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._download_json( + 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', + entity_id, query={'id': entity_id}) + video_id = entity['videoId'] + episode = strip_or_none(entity.get('name')) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': episode, + # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], + 'url': 'https://videos.tva.ca/details/_' + video_id, + 'description': entity.get('longDescription'), + 'duration': float_or_none(entity.get('durationMillis'), 1000), + 'episode': episode, + 'episode_number': int_or_none(entity.get('episodeNumber')), + # 'ie_key': 'BrightcoveNew', + 'ie_key': TVAIE.ie_key(), + } diff --git a/youtube_dlc/extractor/tver.py b/youtube_dlc/extractor/tver.py new file mode 100644 index 000000000..931d4d650 --- /dev/null +++ b/youtube_dlc/extractor/tver.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P(?:corner|episode|feature)/(?Pf?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + info = { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + } + + if service == 'cx': + info.update({ + 'title': main.get('subtitle') or main['title'], + 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), + 'ie_key': 'FujiTVFODPlus7', + }) + else: + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + info.update({ + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + }) + + return info diff --git a/youtube_dlc/extractor/videa.py b/youtube_dlc/extractor/videa.py index a03614cc1..ab2c15cde 100644 --- a/youtube_dlc/extractor/videa.py +++ b/youtube_dlc/extractor/videa.py @@ -1,10 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import random +import re import string -import struct from .common import InfoExtractor from ..utils import ( @@ -12,13 +11,14 @@ from ..utils import ( int_or_none, mimetype2ext, parse_codecs, + update_url_query, xpath_element, xpath_text, ) from ..compat import ( compat_b64decode, compat_ord, - compat_parse_qs, + compat_struct_pack, ) @@ -28,7 +28,7 @@ class VideaIE(InfoExtractor): videa(?:kid)?\.hu/ (?: videok/(?:[^/]+/)*[^?#&]+-| - player\?.*?\bv=| + (?:videojs_)?player\?.*?\bv=| player/v/ ) (?P[^?#&]+) @@ -62,6 +62,7 @@ class VideaIE(InfoExtractor): 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, }] + _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' @staticmethod def _extract_urls(webpage): @@ -69,75 +70,84 @@ class VideaIE(InfoExtractor): r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] - def rc4(self, ciphertext, key): + @staticmethod + def rc4(cipher_text, key): res = b'' - keyLen = len(key) + key_len = len(key) S = list(range(256)) j = 0 for i in range(256): - j = (j + S[i] + ord(key[i % keyLen])) % 256 + j = (j + S[i] + ord(key[i % key_len])) % 256 S[i], S[j] = S[j], S[i] i = 0 j = 0 - for m in range(len(ciphertext)): + for m in range(len(cipher_text)): i = (i + 1) % 256 j = (j + S[i]) % 256 S[i], S[j] = S[j], S[i] k = S[(S[i] + S[j]) % 256] - res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - return res + return res.decode() def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=True) - error = self._search_regex(r'

    ([^<]+)

    ', webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - - video_src_params_raw = self._search_regex(r']+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') - video_src_params = compat_parse_qs(video_src_params_raw) - player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) - nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') - random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) - static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + query = {'v': video_id} + player_page = self._download_webpage( + 'https://videa.hu/player', video_id, query=query) + + nonce = self._search_regex( + r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') l = nonce[:32] s = nonce[32:] result = '' for i in range(0, 32): - result += s[i - (static_secret.index(l[i]) - 31)] - - video_src_params['_s'] = random_seed - video_src_params['_t'] = result[:16] - encryption_key_stem = result[16:] + random_seed - - [b64_info, handle] = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, - query=video_src_params, fatal=True) - - encrypted_info = compat_b64decode(b64_info) - key = encryption_key_stem + handle.info()['x-videa-xs'] - info_str = self.rc4(encrypted_info, key).decode('utf8') - info = self._parse_xml(info_str, video_id) - - video = xpath_element(info, './/video', 'video', fatal=True) - sources = xpath_element(info, './/video_sources', 'sources', fatal=True) - hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) + result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] + + random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + query['_s'] = random_seed + query['_t'] = result[:16] + + b64_info, handle = self._download_webpage_handle( + 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + if b64_info.startswith('\d+)/track/(?P\d+)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508/track/4878838', - 'md5': 'f496818aa2f60b6c0062980d2e00dc20', + 'md5': 'dec8b661f12027ceaba33318787fff76', 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', - 'filesize': 4628061, + 'title': 'md5:c63e19341fdbe84e43425a30bc777856', + 'filesize': int, 'duration': 193.04, - 'track': 'Gypsy Eyes 1', - 'album': 'Gypsy Soul', - 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio & Fabio Di Bari', + 'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff', + 'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a', + 'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200', + 'artist': 'md5:e6fd86621825f14dc0b25db3acd68160', 'release_year': 2009, }, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { # multiple disks 'url': 'http://music.yandex.ru/album/3840501/track/705105', - 'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e', + 'md5': '82a54e9e787301dd45aba093cf6e58c0', 'info_dict': { 'id': '705105', 'ext': 'mp3', - 'title': 'Hooverphonic - Sometimes', - 'filesize': 5743386, + 'title': 'md5:f86d4a9188279860a83000277024c1a6', + 'filesize': int, 'duration': 239.27, - 'track': 'Sometimes', - 'album': 'The Best of Hooverphonic', - 'album_artist': 'Hooverphonic', - 'artist': 'Hooverphonic', + 'track': 'md5:40f887f0666ba1aa10b835aca44807d1', + 'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873', + 'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', + 'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', 'release_year': 2016, 'genre': 'pop', 'disc_number': 2, 'track_number': 9, }, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - album_id, track_id = mobj.group('album_id'), mobj.group('id') + tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id') - track = self._download_json( - 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), - track_id, 'Downloading track JSON')['track'] + track = self._call_api( + 'track', tld, url, track_id, 'Downloading track JSON', + {'track': '%s:%s' % (track_id, album_id)})['track'] track_title = track['title'] download_data = self._download_json( @@ -109,8 +121,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'Downloading track location JSON', query={'format': 'json'}) key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() - storage = track['storageDir'].split('.') - f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1]) + f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') @@ -180,42 +191,85 @@ class YandexMusicTrackIE(YandexMusicBaseIE): class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): + def _extract_tracks(self, source, item_id, url, tld): + tracks = source['tracks'] + track_ids = [compat_str(track_id) for track_id in source['trackIds']] + + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, + # missing tracks should be retrieved manually. + if len(tracks) < len(track_ids): + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON', { + 'entries': ','.join(missing_track_ids), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + + return tracks + def _build_playlist(self, tracks): - return [ - self.url_result( - 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) - for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] + entries = [] + for track in tracks: + track_id = track.get('id') or track.get('realId') + if not track_id: + continue + albums = track.get('albums') + if not albums or not isinstance(albums, list): + continue + album = albums[0] + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s/track/%s' % (album_id, track_id), + ie=YandexMusicTrackIE.ie_key(), video_id=track_id)) + return entries class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:album' IE_DESC = 'Яндекс.Музыка - Альбом' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/?(\?|$)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/album/(?P\d+)/?(\?|$)' _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508', 'info_dict': { 'id': '540508', - 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', + 'title': 'md5:7ed1c3567f28d14be9f61179116f5571', }, 'playlist_count': 50, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { 'url': 'https://music.yandex.ru/album/3840501', 'info_dict': { 'id': '3840501', - 'title': 'Hooverphonic - The Best of Hooverphonic (2016)', + 'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f', }, 'playlist_count': 33, - 'skip': 'Travis CI servers blocked by YandexMusic', + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): - album_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + album_id = mobj.group('id') - album = self._download_json( - 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, - album_id, 'Downloading album JSON') + album = self._call_api( + 'album', tld, url, album_id, 'Downloading album JSON', + {'album': album_id}) entries = self._build_playlist([track for volume in album['volumes'] for track in volume]) @@ -236,21 +290,24 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', - 'title': 'Что слушают Enter Shikari', + 'title': 'md5:841559b3fe2b998eca88d0d2e22a3097', 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, - 'playlist_count': 6, - 'skip': 'Travis CI servers blocked by YandexMusic', + 'playlist_count': 5, + # 'skip': 'Travis CI servers blocked by YandexMusic', }, { - # playlist exceeding the limit of 150 tracks shipped with webpage (see - # https://github.com/ytdl-org/youtube-dl/issues/6666) 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'only_matching': True, + }, { + # playlist exceeding the limit of 150 tracks (see + # https://github.com/ytdl-org/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364', 'info_dict': { - 'id': '1036', - 'title': 'Музыка 90-х', + 'id': '1364', + 'title': 'md5:b3b400f997d3f878a13ae0699653f7db', }, - 'playlist_mincount': 300, - 'skip': 'Travis CI servers blocked by YandexMusic', + 'playlist_mincount': 437, + # 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): @@ -259,16 +316,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): user = mobj.group('user') playlist_id = mobj.group('id') - playlist = self._download_json( - 'https://music.yandex.%s/handlers/playlist.jsx' % tld, - playlist_id, 'Downloading missing tracks JSON', - fatal=False, - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - 'X-Retpath-Y': url, - }, - query={ + playlist = self._call_api( + 'playlist', tld, url, playlist_id, 'Downloading playlist JSON', { 'owner': user, 'kinds': playlist_id, 'light': 'true', @@ -277,37 +326,103 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks = playlist['tracks'] - track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] - - # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, - # missing tracks should be retrieved manually. - if len(tracks) < len(track_ids): - present_track_ids = set([ - compat_str(track['id']) - for track in tracks if track.get('id')]) - missing_track_ids = [ - track_id for track_id in track_ids - if track_id not in present_track_ids] - missing_tracks = self._download_json( - 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, - playlist_id, 'Downloading missing tracks JSON', - fatal=False, - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }, - query={ - 'entries': ','.join(missing_track_ids), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) + tracks = self._extract_tracks(playlist, playlist_id, url, tld) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), playlist.get('title'), playlist.get('description')) + + +class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE): + def _call_artist(self, tld, url, artist_id): + return self._call_api( + 'artist', tld, url, artist_id, + 'Downloading artist %s JSON' % self._ARTIST_WHAT, { + 'artist': artist_id, + 'what': self._ARTIST_WHAT, + 'sort': self._ARTIST_SORT or '', + 'dir': '', + 'period': '', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + }) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + title = try_get(data, lambda x: x['artist']['name'], compat_str) + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:tracks' + IE_DESC = 'Яндекс.Музыка - Артист - Треки' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/artist/(?P\d+)/tracks' + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/tracks', + 'info_dict': { + 'id': '617526', + 'title': 'md5:131aef29d45fd5a965ca613e708c040b', + }, + 'playlist_count': 507, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = '' + _ARTIST_WHAT = 'tracks' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Треки') + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:albums' + IE_DESC = 'Яндекс.Музыка - Артист - Альбомы' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/artist/(?P\d+)/albums' + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/albums', + 'info_dict': { + 'id': '617526', + 'title': 'md5:55dc58d5c85699b7fb41ee926700236c', + }, + 'playlist_count': 8, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = 'year' + _ARTIST_WHAT = 'albums' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + entries = [] + for album in data['albums']: + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s' % album_id, + ie=YandexMusicAlbumIE.ie_key(), video_id=album_id)) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Альбомы') + return self.playlist_result(entries, artist_id, title) diff --git a/youtube_dlc/extractor/zdf.py b/youtube_dlc/extractor/zdf.py index 7b5ad4a6e..d9b393e6e 100644 --- a/youtube_dlc/extractor/zdf.py +++ b/youtube_dlc/extractor/zdf.py @@ -41,7 +41,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): IE_NAME = "ZDF-3sat" _VALID_URL = r'https?://www\.(zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -131,7 +131,7 @@ class ZDFIE(ZDFBaseIE): if not ptmd_path: ptmd_path = t[ 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') + '{playerId}', 'ngplayer_2_4') ptmd = self._call_api( urljoin(url, ptmd_path), player, url, video_id, 'metadata')