From b1ef860624147c61e6a0d62a02f50e6cab019eac Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 12 Dec 2020 19:00:36 +0530 Subject: [PATCH] Update to release 2020.12.12 --- youtube_dlc/YoutubeDL.py | 2 +- youtube_dlc/extractor/facebook.py | 241 +++++++++++++++++------ youtube_dlc/extractor/itv.py | 307 +++++++++--------------------- youtube_dlc/extractor/ruutu.py | 92 ++++++++- youtube_dlc/extractor/wdr.py | 37 +++- 5 files changed, 392 insertions(+), 287 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index a5c8e1df6..32766ea61 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -2505,7 +2505,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % diff --git a/youtube_dlc/extractor/facebook.py b/youtube_dlc/extractor/facebook.py index 610d66745..147deccc9 100644 --- a/youtube_dlc/extractor/facebook.py +++ b/youtube_dlc/extractor/facebook.py @@ -16,14 +16,17 @@ from ..utils import ( clean_html, error_to_compat_str, ExtractorError, + float_or_none, get_element_by_id, int_or_none, js_to_json, limit_length, parse_count, + qualities, sanitized_Request, try_get, urlencode_postdata, + urljoin, ) @@ -39,7 +42,8 @@ class FacebookIE(InfoExtractor): photo\.php| video\.php| video/embed| - story\.php + story\.php| + watch(?:/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| @@ -54,8 +58,6 @@ class FacebookIE(InfoExtractor): _NETRC_MACHINE = 'facebook' IE_NAME = 'facebook' - _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' - _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' @@ -72,6 +74,7 @@ class FacebookIE(InfoExtractor): }, 'skip': 'Requires logging in', }, { + # data.video 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', @@ -133,6 +136,7 @@ class FacebookIE(InfoExtractor): }, }, { # have 1080P, but only up to 720p in swf params + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { @@ -147,6 +151,7 @@ class FacebookIE(InfoExtractor): }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 'info_dict': { 'id': '1417995061575415', @@ -174,6 +179,7 @@ class FacebookIE(InfoExtractor): 'skip_download': True, }, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '1396382447100162', @@ -193,18 +199,23 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, }, { + # data.mediaset.currMedia.edges 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'only_matching': True, }, { + # data.video.story.attachments[].media 'url': 'facebook:544765982287235', 'only_matching': True, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, }, { + # data.video.creation_story.attachments[].media 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'only_matching': True, }, { @@ -212,6 +223,7 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { 'id': '359649331226507', @@ -222,7 +234,54 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', + 'info_dict': { + 'id': '106560053808006', + }, + 'playlist_count': 2, + }, { + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/?v=647537299265662', + 'only_matching': True, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', + 'info_dict': { + 'id': '10157667649866271', + }, + 'playlist_count': 3, + }, { + # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', + 'info_dict': { + 'id': '117576630041613', + 'ext': 'mp4', + # TODO: title can be extracted from video page + 'title': 'Facebook video #117576630041613', + 'uploader_id': '189393014416438', + 'upload_date': '20201123', + 'timestamp': 1606162592, + }, + 'skip': 'Requires logging in', + }, { + # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', + 'info_dict': { + 'id': '211567722618337', + 'ext': 'mp4', + 'title': 'Facebook video #211567722618337', + 'uploader_id': '127875227654254', + 'upload_date': '20161122', + 'timestamp': 1479793574, + }, + }, { + # data.video.creation_story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', + 'only_matching': True, }] + _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' @staticmethod def _extract_urls(webpage): @@ -305,23 +364,24 @@ class FacebookIE(InfoExtractor): def _real_initialize(self): self._login() - def _extract_from_url(self, url, video_id, fatal_if_no_video=True): - req = sanitized_Request(url) - req.add_header('User-Agent', self._CHROME_USER_AGENT) - webpage = self._download_webpage(req, video_id) + def _extract_from_url(self, url, video_id): + webpage = self._download_webpage( + url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) video_data = None def extract_video_data(instances): + video_data = [] for item in instances: - if item[1][0] == 'VideoConfig': + if try_get(item, lambda x: x[1][0]) == 'VideoConfig': video_item = item[2][0] if video_item.get('video_id'): - return video_item['videoData'] + video_data.append(video_item['videoData']) + return video_data server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})(?:\);|,")', webpage, - 'server js data', default='{}'), video_id, fatal=False) + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False) if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) @@ -331,17 +391,111 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) + def extract_dash_manifest(video, formats): + dash_manifest = video.get('dash_manifest') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + + def process_formats(formats): + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + + self._sort_formats(formats) + if not video_data: - server_js_data = self._parse_json( - self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', - webpage, 'js data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) + server_js_data = self._parse_json(self._search_regex([ + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, + r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX + ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) if not video_data: - if not fatal_if_no_video: - return webpage, False + graphql_data = self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);', + webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} + for require in (graphql_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + entries = [] + + def parse_graphql_video(video): + formats = [] + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + process_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + entries.append(info) + + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) + + data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} + attachments = try_get(story, [ + lambda x: x['attached_story']['attachments'], + lambda x: x['attachments'] + ], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) + + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') + + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: + parse_graphql_video(video) + + return self.playlist_result(entries, video_id) + + if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) if m_msg is not None: raise ExtractorError( @@ -379,8 +533,19 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') - subtitles = {} + if len(video_data) > 1: + entries = [] + for v in video_data: + video_url = v[0].get('video_url') + if not video_url: + continue + entries.append(self.url_result(urljoin( + url, video_url), self.ie_key(), v[0].get('video_id'))) + return self.playlist_result(entries, video_id) + video_data = video_data[0] + formats = [] + subtitles = {} for f in video_data: format_id = f['stream_type'] if f and isinstance(f, dict): @@ -399,22 +564,14 @@ class FacebookIE(InfoExtractor): 'url': src, 'preference': preference, }) - dash_manifest = f[0].get('dash_manifest') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) if not formats: raise ExtractorError('Cannot find video formats') - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. - for f in formats: - f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats) + process_formats(formats) video_title = self._html_search_regex( r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, @@ -454,35 +611,13 @@ class FacebookIE(InfoExtractor): 'subtitles': subtitles, } - return webpage, info_dict + return info_dict def _real_extract(self, url): video_id = self._match_id(url) real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) - - if info_dict: - return info_dict - - if '/posts/' in url: - video_id_json = self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', webpage, 'video ids', group='ids', - default='') - if video_id_json: - entries = [ - self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) - for vid in self._parse_json(video_id_json, video_id)] - return self.playlist_result(entries, video_id) - - # Single Video? - video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id') - return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) - else: - _, info_dict = self._extract_from_url( - self._VIDEO_PAGE_TEMPLATE % video_id, - video_id, fatal_if_no_video=True) - return info_dict + return self._extract_from_url(real_url, video_id) class FacebookPluginsVideoIE(InfoExtractor): diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py index 20144cd82..cef38b0fa 100644 --- a/youtube_dlc/extractor/itv.py +++ b/youtube_dlc/extractor/itv.py @@ -1,30 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import uuid -import xml.etree.ElementTree as etree import json import re from .common import InfoExtractor from .brightcove import BrightcoveNewIE -from ..compat import ( - compat_str, - compat_etree_register_namespace, -) from ..utils import ( determine_ext, - ExtractorError, extract_attributes, - int_or_none, + get_element_by_class, + JSON_LD_RE, merge_dicts, parse_duration, smuggle_url, + strip_or_none, try_get, url_or_none, - xpath_with_ns, - xpath_element, - xpath_text, ) @@ -32,14 +24,18 @@ class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] _TESTS = [{ - 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', + 'url': 'https://www.itv.com/hub/liar/2a4547a0012', 'info_dict': { - 'id': '2a2936a0053', - 'ext': 'flv', - 'title': 'Home Movie', + 'id': '2a4547a0012', + 'ext': 'mp4', + 'title': 'Liar - Series 2 - Episode 6', + 'description': 'md5:d0f91536569dec79ea184f0a44cca089', + 'series': 'Liar', + 'season_number': 2, + 'episode_number': 6, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, { @@ -62,220 +58,97 @@ class ITVIE(InfoExtractor): params = extract_attributes(self._search_regex( r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - ns_map = { - 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', - 'tem': 'http://tempuri.org/', - 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types', - 'com': 'http://schemas.itv.com/2009/05/Common', - } - for ns, full_ns in ns_map.items(): - compat_etree_register_namespace(ns, full_ns) - - def _add_ns(name): - return xpath_with_ns(name, ns_map) - - def _add_sub_element(element, name): - return etree.SubElement(element, _add_ns(name)) - - production_id = ( - params.get('data-video-autoplay-id') - or '%s#001' % ( - params.get('data-video-episode-id') - or video_id.replace('a', '/'))) - - req_env = etree.Element(_add_ns('soapenv:Envelope')) - _add_sub_element(req_env, 'soapenv:Header') - body = _add_sub_element(req_env, 'soapenv:Body') - get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) - request = _add_sub_element(get_playlist, 'tem:request') - _add_sub_element(request, 'itv:ProductionId').text = production_id - _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() - vodcrid = _add_sub_element(request, 'itv:Vodcrid') - _add_sub_element(vodcrid, 'com:Id') - _add_sub_element(request, 'itv:Partition') - user_info = _add_sub_element(get_playlist, 'tem:userInfo') - _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv' - _add_sub_element(user_info, 'itv:DM') - _add_sub_element(user_info, 'itv:RevenueScienceValue') - _add_sub_element(user_info, 'itv:SessionId') - _add_sub_element(user_info, 'itv:SsoToken') - _add_sub_element(user_info, 'itv:UserToken') - site_info = _add_sub_element(get_playlist, 'tem:siteInfo') - _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None' - _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV' - _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any' - _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO' - _add_sub_element(site_info, 'itv:Category') - _add_sub_element(site_info, 'itv:Platform').text = 'DotCom' - _add_sub_element(site_info, 'itv:Site').text = 'ItvCom' - device_info = _add_sub_element(get_playlist, 'tem:deviceInfo') - _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big' - player_info = _add_sub_element(get_playlist, 'tem:playerInfo') - _add_sub_element(player_info, 'itv:Version').text = '2' - + ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] + hmac = params['data-video-hmac'] headers = self.geo_verification_headers() headers.update({ - 'Content-Type': 'text/xml; charset=utf-8', - 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', + 'Accept': 'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), }) + ios_playlist = self._download_json( + ios_playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Safari', + 'model': '5', + 'os': { + 'name': 'Windows NT', + 'version': '6.1', + 'type': 'desktop' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': ['hls', 'aes', 'outband-webvtt'], + 'max': ['hls', 'aes', 'outband-webvtt'] + }, + 'platformTag': 'dotcom' + } + }).encode(), headers=headers) + video_data = ios_playlist['Playlist']['Video'] + ios_base_url = video_data.get('Base') - info = self._search_json_ld(webpage, video_id, default={}) formats = [] - subtitles = {} - - def extract_subtitle(sub_url): - ext = determine_ext(sub_url, 'ttml') - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - resp_env = self._download_xml( - params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env), fatal=False) - if resp_env: - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) + for media_file in (video_data.get('MediaFiles') or []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), + formats.append({ + 'url': href, }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] - - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) - - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + self._sort_formats(formats) - ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') - hmac = params.get('data-video-hmac') - if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): - headers = self.geo_verification_headers() - headers.update({ - 'Accept': 'application/vnd.itv.vod.playlist.v2+json', - 'Content-Type': 'application/json', - 'hmac': hmac.upper(), + subtitles = {} + subs = video_data.get('Subtitles') or [] + for sub in subs: + if not isinstance(sub, dict): + continue + href = url_or_none(sub.get('Href')) + if not href: + continue + subtitles.setdefault('en', []).append({ + 'url': href, + 'ext': determine_ext(href, 'vtt'), }) - ios_playlist = self._download_json( - ios_playlist_url, video_id, data=json.dumps({ - 'user': { - 'itvUserId': '', - 'entitlements': [], - 'token': '' - }, - 'device': { - 'manufacturer': 'Safari', - 'model': '5', - 'os': { - 'name': 'Windows NT', - 'version': '6.1', - 'type': 'desktop' - } - }, - 'client': { - 'version': '4.1', - 'id': 'browser' - }, - 'variantAvailability': { - 'featureset': { - 'min': ['hls', 'aes', 'outband-webvtt'], - 'max': ['hls', 'aes', 'outband-webvtt'] - }, - 'platformTag': 'dotcom' - } - }).encode(), headers=headers, fatal=False) - if ios_playlist: - video_data = ios_playlist.get('Playlist', {}).get('Video', {}) - ios_base_url = video_data.get('Base') - for media_file in video_data.get('MediaFiles', []): - href = media_file.get('Href') - if not href: - continue - if ios_base_url: - href = ios_base_url + href - ext = determine_ext(href) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': href, - }) - subs = video_data.get('Subtitles') - if isinstance(subs, list): - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) - if href: - extract_subtitle(href) - if not info.get('duration'): - info['duration'] = parse_duration(video_data.get('Duration')) - - self._sort_formats(formats) - info.update({ + info = self._search_json_ld(webpage, video_id, default={}) + if not info: + json_ld = self._parse_json(self._search_regex( + JSON_LD_RE, webpage, 'JSON-LD', '{}', + group='json_ld'), video_id, fatal=False) + if json_ld and json_ld.get('@type') == 'BreadcrumbList': + for ile in (json_ld.get('itemListElement:') or []): + item = ile.get('item:') or {} + if item.get('@type') == 'TVEpisode': + item['@context'] = 'http://schema.org' + info = self._json_ld(item, video_id, fatal=False) or {} + break + + return merge_dicts({ 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), 'formats': formats, 'subtitles': subtitles, - }) - - webpage_info = self._search_json_ld(webpage, video_id, default={}) - if not webpage_info.get('title'): - webpage_info['title'] = self._html_search_regex( - r'(?s)]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or webpage_info['episode'] - - return merge_dicts(info, webpage_info) + 'duration': parse_duration(video_data.get('Duration')), + 'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)), + }, info) class ITVBTCCIE(InfoExtractor): diff --git a/youtube_dlc/extractor/ruutu.py b/youtube_dlc/extractor/ruutu.py index f984040aa..c50cd3ecd 100644 --- a/youtube_dlc/extractor/ruutu.py +++ b/youtube_dlc/extractor/ruutu.py @@ -6,14 +6,24 @@ from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, ExtractorError, + find_xpath_attr, int_or_none, + unified_strdate, + url_or_none, xpath_attr, xpath_text, ) class RuutuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| + static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= + ) + (?P\d+) + ''' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', @@ -71,15 +81,53 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, }, - 'expected_warnings': ['HTTP Error 502: Bad Gateway'], - } + 'expected_warnings': [ + 'HTTP Error 502: Bad Gateway', + 'Failed to download m3u8 information', + ], + }, + { + 'url': 'http://www.supla.fi/audio/2231370', + 'only_matching': True, + }, + { + 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', + 'only_matching': True, + }, + { + # episode + 'url': 'https://www.ruutu.fi/video/3401964', + 'info_dict': { + 'id': '3401964', + 'ext': 'mp4', + 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', + 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2582, + 'age_limit': 12, + 'upload_date': '20190508', + 'series': 'Temptation Island Suomi', + 'season_number': 5, + 'episode_number': 17, + 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'], + }, + 'params': { + 'skip_download': True, + }, + }, + { + # premium + 'url': 'https://www.ruutu.fi/video/3618715', + 'only_matching': True, + }, ] + _API_BASE = 'https://gatling.nelonenmedia.fi' def _real_extract(self, url): video_id = self._match_id(url) video_xml = self._download_xml( - 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + '%s/media-xml-cache' % self._API_BASE, video_id, query={'id': video_id}) formats = [] @@ -96,9 +144,18 @@ class RuutuIE(InfoExtractor): continue processed_urls.append(video_url) ext = determine_ext(video_url) + auth_video_url = url_or_none(self._download_webpage( + '%s/auth/access/v2' % self._API_BASE, video_id, + note='Downloading authenticated %s stream URL' % ext, + fatal=False, query={'stream': video_url})) + if auth_video_url: + processed_urls.append(auth_video_url) + video_url = auth_video_url if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) @@ -136,18 +193,35 @@ class RuutuIE(InfoExtractor): extract_formats(video_xml.find('./Clip')) - drm = xpath_text(video_xml, './Clip/DRM', default=None) - if not formats and drm: - raise ExtractorError('This video is DRM protected.', expected=True) + def pv(name): + node = find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name) + if node is not None: + return node.get('value') + + if not formats: + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if drm: + raise ExtractorError('This video is DRM protected.', expected=True) + ns_st_cds = pv('ns_st_cds') + if ns_st_cds != 'free': + raise ExtractorError('This video is %s.' % ns_st_cds, expected=True) self._sort_formats(formats) + themes = pv('themes') + return { 'id': video_id, 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), - 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'upload_date': unified_strdate(pv('date_start')), + 'series': pv('series_name'), + 'season_number': int_or_none(pv('season_number')), + 'episode_number': int_or_none(pv('episode_number')), + 'categories': themes.split(',') if themes else [], 'formats': formats, } diff --git a/youtube_dlc/extractor/wdr.py b/youtube_dlc/extractor/wdr.py index 44d4a13ca..5cb5924f8 100644 --- a/youtube_dlc/extractor/wdr.py +++ b/youtube_dlc/extractor/wdr.py @@ -17,6 +17,7 @@ from ..utils import ( unified_strdate, update_url_query, urlhandle_detect_ext, + url_or_none, ) @@ -42,15 +43,15 @@ class WDRIE(InfoExtractor): is_live = metadata.get('mediaType') == 'live' tracker_data = metadata['trackerData'] + title = tracker_data['trackerClipTitle'] media_resource = metadata['mediaResource'] formats = [] - subtitles = {} # check if the metadata contains a direct URL to a file - for kind, media_resource in media_resource.items(): + for kind, media in media_resource.items(): if kind == 'captionsHash': - for ext, url in media_resource.items(): + for ext, url in media.items(): subtitles.setdefault('de', []).append({ 'url': url, 'ext': ext, @@ -59,8 +60,10 @@ class WDRIE(InfoExtractor): if kind not in ('dflt', 'alt'): continue + if not isinstance(media, dict): + continue - for tag_name, medium_url in media_resource.items(): + for tag_name, medium_url in media.items(): if tag_name not in ('videoURL', 'audioURL'): continue @@ -90,7 +93,23 @@ class WDRIE(InfoExtractor): self._sort_formats(formats) - title = tracker_data['trackerClipTitle'] + subtitles = {} + caption_url = media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + captions_hash = media_resource.get('captionsHash') + if isinstance(captions_hash, dict): + for ext, format_url in captions_hash.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + subtitles.setdefault('de', []).append({ + 'url': format_url, + 'ext': determine_ext(format_url, None) or ext, + }) return { 'id': tracker_data.get('trackerClipId', video_id), @@ -106,7 +125,7 @@ class WDRIE(InfoExtractor): class WDRPageIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { @@ -213,7 +232,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', 'only_matching': True, - } + }, + { + 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', + 'only_matching': True, + }, ] def _real_extract(self, url):