From c7a13929f8aec1c7709c0947ee2904aa84fe5d53 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 7 Dec 2020 16:09:42 +0530 Subject: [PATCH] Update to release 2020.12.07 --- docs/supportedsites.md | 3 + youtube_dlc/extractor/aenetworks.py | 271 ++++++++++++++++------------ youtube_dlc/extractor/common.py | 8 +- youtube_dlc/extractor/extractors.py | 3 + youtube_dlc/extractor/generic.py | 8 +- youtube_dlc/extractor/youtube.py | 55 ++++-- 6 files changed, 211 insertions(+), 137 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 083c0ee6d..3931de67d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -34,6 +34,8 @@ - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault + - **aenetworks:collection** + - **aenetworks:show** - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -1176,6 +1178,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtBe**: youtu.be - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword - **Zapiks** - **Zaq1** diff --git a/youtube_dlc/extractor/aenetworks.py b/youtube_dlc/extractor/aenetworks.py index 611b948f5..3d0cf1208 100644 --- a/youtube_dlc/extractor/aenetworks.py +++ b/youtube_dlc/extractor/aenetworks.py @@ -5,20 +5,30 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +41,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -61,20 +71,13 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P[^/]+(?:/[^/]+){0,2})| - movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +94,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { - 'id': '71889446852', + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', + 'params': { + # m3u8 download + 'skip_download': True, }, - 'playlist_mincount': 2, - }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,80 +121,152 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True }, { - 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P[^']+)'", - r'data-media-url=(?P(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') + domain, canonical = re.match(self._VALID_URL, url).groups() + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + canonical, query={'filter[canonical]': '/' + canonical})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) auth = None if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) return info +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SH012427480000', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 168, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' @@ -204,6 +280,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +289,8 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index a5df94e9c..44b914247 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -2514,16 +2514,18 @@ class InfoExtractor(object): # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) + # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] + in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)', webpage)) - for media_tag, media_type, media_content in media_tags: + r'(?s)(<(?P%s)(?:\s+[^>]*)?>)(.*?)' % _MEDIA_TAG_NAME_RE, webpage)) + for media_tag, _, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 0b2b234b6..f49b998fa 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -30,6 +30,8 @@ from .adobetv import ( from .adultswim import AdultSwimIE from .aenetworks import ( AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, HistoryTopicIE, ) from .afreecatv import AfreecaTVIE @@ -1557,6 +1559,7 @@ from .youtube import ( YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, + YoutubeYtBeIE, YoutubeYtUserIE, YoutubeWatchLaterIE, ) diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index 965c25be6..83cf204cb 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -29,7 +29,7 @@ from ..utils import ( sanitized_Request, smuggle_url, unescapeHTML, - unified_strdate, + unified_timestamp, unsmuggle_url, UnsupportedError, xpath_text, @@ -2330,7 +2330,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) } # Check for direct link to a video @@ -2436,7 +2436,9 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) + # FIXME: unescaping the whole page may break URLs, commenting out for now. + # There probably should be a second run of generic extractor on unescaped webpage. + # webpage = compat_urllib_parse_unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294 diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 625fdb899..b08ac6655 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3318,8 +3318,7 @@ class YoutubePlaylistIE(InfoExtractor): (?: (?: youtube(?:kids)?\.com| - invidio\.us| - youtu\.be + invidio\.us ) /.*?\?.*?\blist= )? @@ -3364,6 +3363,32 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', } }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeTabIE.suitable(url) else super( + YoutubePlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} + return self.url_result( + update_url_query('https://www.youtube.com/playlist', qs), + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + + +class YoutubeYtBeIE(InfoExtractor): + _VALID_URL = r'https?://youtu\.be/(?P[0-9A-Za-z_-]{11})/*?.*?\blist=(?P%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + _TESTS = [{ 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { 'id': 'yeWKywCrFtk', @@ -3386,28 +3411,18 @@ class YoutubePlaylistIE(InfoExtractor): }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) - def _real_extract(self, url): - playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if not qs: - qs = {'list': playlist_id} + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + playlist_id = mobj.group('playlist_id') return self.url_result( - update_url_query('https://www.youtube.com/playlist', qs), - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + update_url_query('https://www.youtube.com/watch', { + 'v': video_id, + 'list': playlist_id, + 'feature': 'youtu.be', + }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) class YoutubeYtUserIE(InfoExtractor):