diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index a5c8e1df6..32766ea61 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -2505,7 +2505,7 @@ class YoutubeDL(object):
thumb_ext = determine_ext(t['url'], 'jpg')
suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
- t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+ t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
self.to_screen('[%s] %s: Thumbnail %sis already present' %
diff --git a/youtube_dlc/extractor/facebook.py b/youtube_dlc/extractor/facebook.py
index 610d66745..147deccc9 100644
--- a/youtube_dlc/extractor/facebook.py
+++ b/youtube_dlc/extractor/facebook.py
@@ -16,14 +16,17 @@ from ..utils import (
clean_html,
error_to_compat_str,
ExtractorError,
+ float_or_none,
get_element_by_id,
int_or_none,
js_to_json,
limit_length,
parse_count,
+ qualities,
sanitized_Request,
try_get,
urlencode_postdata,
+ urljoin,
)
@@ -39,7 +42,8 @@ class FacebookIE(InfoExtractor):
photo\.php|
video\.php|
video/embed|
- story\.php
+ story\.php|
+ watch(?:/live)?/?
)\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
@@ -54,8 +58,6 @@ class FacebookIE(InfoExtractor):
_NETRC_MACHINE = 'facebook'
IE_NAME = 'facebook'
- _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
-
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
@@ -72,6 +74,7 @@ class FacebookIE(InfoExtractor):
},
'skip': 'Requires logging in',
}, {
+ # data.video
'url': 'https://www.facebook.com/video.php?v=274175099429670',
'info_dict': {
'id': '274175099429670',
@@ -133,6 +136,7 @@ class FacebookIE(InfoExtractor):
},
}, {
# have 1080P, but only up to 720p in swf params
+ # data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
'md5': '9571fae53d4165bbbadb17a94651dcdc',
'info_dict': {
@@ -147,6 +151,7 @@ class FacebookIE(InfoExtractor):
},
}, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
+ # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
'info_dict': {
'id': '1417995061575415',
@@ -174,6 +179,7 @@ class FacebookIE(InfoExtractor):
'skip_download': True,
},
}, {
+ # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': {
'id': '1396382447100162',
@@ -193,18 +199,23 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
'only_matching': True,
}, {
+ # data.mediaset.currMedia.edges
'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
'only_matching': True,
}, {
+ # data.video.story.attachments[].media
'url': 'facebook:544765982287235',
'only_matching': True,
}, {
+ # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
'only_matching': True,
}, {
+ # data.video.creation_story.attachments[].media
'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
'only_matching': True,
}, {
+ # data.video
'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
'only_matching': True,
}, {
@@ -212,6 +223,7 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
'only_matching': True,
}, {
+ # data.video
'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
'info_dict': {
'id': '359649331226507',
@@ -222,7 +234,54 @@ class FacebookIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+ 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
+ 'info_dict': {
+ 'id': '106560053808006',
+ },
+ 'playlist_count': 2,
+ }, {
+ # data.video.story.attachments[].media
+ 'url': 'https://www.facebook.com/watch/?v=647537299265662',
+ 'only_matching': True,
+ }, {
+ # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+ 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
+ 'info_dict': {
+ 'id': '10157667649866271',
+ },
+ 'playlist_count': 3,
+ }, {
+ # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
+ 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
+ 'info_dict': {
+ 'id': '117576630041613',
+ 'ext': 'mp4',
+ # TODO: title can be extracted from video page
+ 'title': 'Facebook video #117576630041613',
+ 'uploader_id': '189393014416438',
+ 'upload_date': '20201123',
+ 'timestamp': 1606162592,
+ },
+ 'skip': 'Requires logging in',
+ }, {
+ # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
+ 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
+ 'info_dict': {
+ 'id': '211567722618337',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #211567722618337',
+ 'uploader_id': '127875227654254',
+ 'upload_date': '20161122',
+ 'timestamp': 1479793574,
+ },
+ }, {
+ # data.video.creation_story.attachments[].media
+ 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
+ 'only_matching': True,
}]
+ _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
@staticmethod
def _extract_urls(webpage):
@@ -305,23 +364,24 @@ class FacebookIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
- req = sanitized_Request(url)
- req.add_header('User-Agent', self._CHROME_USER_AGENT)
- webpage = self._download_webpage(req, video_id)
+ def _extract_from_url(self, url, video_id):
+ webpage = self._download_webpage(
+ url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
video_data = None
def extract_video_data(instances):
+ video_data = []
for item in instances:
- if item[1][0] == 'VideoConfig':
+ if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
video_item = item[2][0]
if video_item.get('video_id'):
- return video_item['videoData']
+ video_data.append(video_item['videoData'])
+ return video_data
server_js_data = self._parse_json(self._search_regex(
- r'handleServerJS\(({.+})(?:\);|,")', webpage,
- 'server js data', default='{}'), video_id, fatal=False)
+ [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
+ webpage, 'server js data', default='{}'), video_id, fatal=False)
if server_js_data:
video_data = extract_video_data(server_js_data.get('instances', []))
@@ -331,17 +391,111 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
+ def extract_dash_manifest(video, formats):
+ dash_manifest = video.get('dash_manifest')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+
+ def process_formats(formats):
+ # Downloads with browser's User-Agent are rate limited. Working around
+ # with non-browser User-Agent.
+ for f in formats:
+ f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+
+ self._sort_formats(formats)
+
if not video_data:
- server_js_data = self._parse_json(
- self._search_regex(
- r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
- webpage, 'js data', default='{}'),
- video_id, transform_source=js_to_json, fatal=False)
+ server_js_data = self._parse_json(self._search_regex([
+ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
+ r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX
+ ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
video_data = extract_from_jsmods_instances(server_js_data)
if not video_data:
- if not fatal_if_no_video:
- return webpage, False
+ graphql_data = self._parse_json(self._search_regex(
+ r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
+ webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
+ for require in (graphql_data.get('require') or []):
+ if require[0] == 'RelayPrefetchedStreamCache':
+ entries = []
+
+ def parse_graphql_video(video):
+ formats = []
+ q = qualities(['sd', 'hd'])
+ for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+ playable_url = video.get('playable_url' + suffix)
+ if not playable_url:
+ continue
+ formats.append({
+ 'format_id': format_id,
+ 'quality': q(format_id),
+ 'url': playable_url,
+ })
+ extract_dash_manifest(video, formats)
+ process_formats(formats)
+ v_id = video.get('videoId') or video.get('id') or video_id
+ info = {
+ 'id': v_id,
+ 'formats': formats,
+ 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+ 'uploader_id': try_get(video, lambda x: x['owner']['id']),
+ 'timestamp': int_or_none(video.get('publish_time')),
+ 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+ }
+ description = try_get(video, lambda x: x['savable_description']['text'])
+ title = video.get('name')
+ if title:
+ info.update({
+ 'title': title,
+ 'description': description,
+ })
+ else:
+ info['title'] = description or 'Facebook video #%s' % v_id
+ entries.append(info)
+
+ def parse_attachment(attachment, key='media'):
+ media = attachment.get(key) or {}
+ if media.get('__typename') == 'Video':
+ return parse_graphql_video(media)
+
+ data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
+ nodes = data.get('nodes') or []
+ node = data.get('node') or {}
+ if not nodes and node:
+ nodes.append(node)
+ for node in nodes:
+ story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
+ attachments = try_get(story, [
+ lambda x: x['attached_story']['attachments'],
+ lambda x: x['attachments']
+ ], list) or []
+ for attachment in attachments:
+ attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
+ ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+ for n in ns:
+ parse_attachment(n)
+ parse_attachment(attachment)
+
+ edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
+ for edge in edges:
+ parse_attachment(edge, key='node')
+
+ video = data.get('video') or {}
+ if video:
+ attachments = try_get(video, [
+ lambda x: x['story']['attachments'],
+ lambda x: x['creation_story']['attachments']
+ ], list) or []
+ for attachment in attachments:
+ parse_attachment(attachment)
+ if not entries:
+ parse_graphql_video(video)
+
+ return self.playlist_result(entries, video_id)
+
+ if not video_data:
            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
@@ -379,8 +533,19 @@ class FacebookIE(InfoExtractor):
if not video_data:
raise ExtractorError('Cannot parse data')
- subtitles = {}
+ if len(video_data) > 1:
+ entries = []
+ for v in video_data:
+ video_url = v[0].get('video_url')
+ if not video_url:
+ continue
+ entries.append(self.url_result(urljoin(
+ url, video_url), self.ie_key(), v[0].get('video_id')))
+ return self.playlist_result(entries, video_id)
+ video_data = video_data[0]
+
formats = []
+ subtitles = {}
for f in video_data:
format_id = f['stream_type']
if f and isinstance(f, dict):
@@ -399,22 +564,14 @@ class FacebookIE(InfoExtractor):
'url': src,
'preference': preference,
})
- dash_manifest = f[0].get('dash_manifest')
- if dash_manifest:
- formats.extend(self._parse_mpd_formats(
- compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+ extract_dash_manifest(f[0], formats)
subtitles_src = f[0].get('subtitles_src')
if subtitles_src:
subtitles.setdefault('en', []).append({'url': subtitles_src})
if not formats:
raise ExtractorError('Cannot find video formats')
- # Downloads with browser's User-Agent are rate limited. Working around
- # with non-browser User-Agent.
- for f in formats:
- f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-
- self._sort_formats(formats)
+ process_formats(formats)
video_title = self._html_search_regex(
            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
@@ -454,35 +611,13 @@ class FacebookIE(InfoExtractor):
'subtitles': subtitles,
}
- return webpage, info_dict
+ return info_dict
def _real_extract(self, url):
video_id = self._match_id(url)
real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
- webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
-
- if info_dict:
- return info_dict
-
- if '/posts/' in url:
- video_id_json = self._search_regex(
-            r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
- default='')
- if video_id_json:
- entries = [
- self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
- for vid in self._parse_json(video_id_json, video_id)]
- return self.playlist_result(entries, video_id)
-
- # Single Video?
- video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
- return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
- else:
- _, info_dict = self._extract_from_url(
- self._VIDEO_PAGE_TEMPLATE % video_id,
- video_id, fatal_if_no_video=True)
- return info_dict
+ return self._extract_from_url(real_url, video_id)
class FacebookPluginsVideoIE(InfoExtractor):
diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py
index 20144cd82..cef38b0fa 100644
--- a/youtube_dlc/extractor/itv.py
+++ b/youtube_dlc/extractor/itv.py
@@ -1,30 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
-import uuid
-import xml.etree.ElementTree as etree
import json
import re
from .common import InfoExtractor
from .brightcove import BrightcoveNewIE
-from ..compat import (
- compat_str,
- compat_etree_register_namespace,
-)
from ..utils import (
determine_ext,
- ExtractorError,
extract_attributes,
- int_or_none,
+ get_element_by_class,
+ JSON_LD_RE,
merge_dicts,
parse_duration,
smuggle_url,
+ strip_or_none,
try_get,
url_or_none,
- xpath_with_ns,
- xpath_element,
- xpath_text,
)
@@ -32,14 +24,18 @@ class ITVIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB']
_TESTS = [{
- 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
+ 'url': 'https://www.itv.com/hub/liar/2a4547a0012',
'info_dict': {
- 'id': '2a2936a0053',
- 'ext': 'flv',
- 'title': 'Home Movie',
+ 'id': '2a4547a0012',
+ 'ext': 'mp4',
+ 'title': 'Liar - Series 2 - Episode 6',
+ 'description': 'md5:d0f91536569dec79ea184f0a44cca089',
+ 'series': 'Liar',
+ 'season_number': 2,
+ 'episode_number': 6,
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
},
}, {
@@ -62,220 +58,97 @@ class ITVIE(InfoExtractor):
params = extract_attributes(self._search_regex(
r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
- ns_map = {
- 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
- 'tem': 'http://tempuri.org/',
- 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
- 'com': 'http://schemas.itv.com/2009/05/Common',
- }
- for ns, full_ns in ns_map.items():
- compat_etree_register_namespace(ns, full_ns)
-
- def _add_ns(name):
- return xpath_with_ns(name, ns_map)
-
- def _add_sub_element(element, name):
- return etree.SubElement(element, _add_ns(name))
-
- production_id = (
- params.get('data-video-autoplay-id')
- or '%s#001' % (
- params.get('data-video-episode-id')
- or video_id.replace('a', '/')))
-
- req_env = etree.Element(_add_ns('soapenv:Envelope'))
- _add_sub_element(req_env, 'soapenv:Header')
- body = _add_sub_element(req_env, 'soapenv:Body')
- get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
- request = _add_sub_element(get_playlist, 'tem:request')
- _add_sub_element(request, 'itv:ProductionId').text = production_id
- _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
- vodcrid = _add_sub_element(request, 'itv:Vodcrid')
- _add_sub_element(vodcrid, 'com:Id')
- _add_sub_element(request, 'itv:Partition')
- user_info = _add_sub_element(get_playlist, 'tem:userInfo')
- _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
- _add_sub_element(user_info, 'itv:DM')
- _add_sub_element(user_info, 'itv:RevenueScienceValue')
- _add_sub_element(user_info, 'itv:SessionId')
- _add_sub_element(user_info, 'itv:SsoToken')
- _add_sub_element(user_info, 'itv:UserToken')
- site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
- _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
- _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
- _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
- _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
- _add_sub_element(site_info, 'itv:Category')
- _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
- _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
- device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
- _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
- player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
- _add_sub_element(player_info, 'itv:Version').text = '2'
-
+ ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
+ hmac = params['data-video-hmac']
headers = self.geo_verification_headers()
headers.update({
- 'Content-Type': 'text/xml; charset=utf-8',
- 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
+ 'Accept': 'application/vnd.itv.vod.playlist.v2+json',
+ 'Content-Type': 'application/json',
+ 'hmac': hmac.upper(),
})
+ ios_playlist = self._download_json(
+ ios_playlist_url, video_id, data=json.dumps({
+ 'user': {
+ 'itvUserId': '',
+ 'entitlements': [],
+ 'token': ''
+ },
+ 'device': {
+ 'manufacturer': 'Safari',
+ 'model': '5',
+ 'os': {
+ 'name': 'Windows NT',
+ 'version': '6.1',
+ 'type': 'desktop'
+ }
+ },
+ 'client': {
+ 'version': '4.1',
+ 'id': 'browser'
+ },
+ 'variantAvailability': {
+ 'featureset': {
+ 'min': ['hls', 'aes', 'outband-webvtt'],
+ 'max': ['hls', 'aes', 'outband-webvtt']
+ },
+ 'platformTag': 'dotcom'
+ }
+ }).encode(), headers=headers)
+ video_data = ios_playlist['Playlist']['Video']
+ ios_base_url = video_data.get('Base')
- info = self._search_json_ld(webpage, video_id, default={})
formats = []
- subtitles = {}
-
- def extract_subtitle(sub_url):
- ext = determine_ext(sub_url, 'ttml')
- subtitles.setdefault('en', []).append({
- 'url': sub_url,
- 'ext': 'ttml' if ext == 'xml' else ext,
- })
-
- resp_env = self._download_xml(
- params['data-playlist-url'], video_id,
- headers=headers, data=etree.tostring(req_env), fatal=False)
- if resp_env:
- playlist = xpath_element(resp_env, './/Playlist')
- if playlist is None:
- fault_code = xpath_text(resp_env, './/faultcode')
- fault_string = xpath_text(resp_env, './/faultstring')
- if fault_code == 'InvalidGeoRegion':
- self.raise_geo_restricted(
- msg=fault_string, countries=self._GEO_COUNTRIES)
- elif fault_code not in (
- 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
- info.update({
- 'title': self._og_search_title(webpage),
- 'episode_title': params.get('data-video-episode'),
- 'series': params.get('data-video-title'),
- })
+ for media_file in (video_data.get('MediaFiles') or []):
+ href = media_file.get('Href')
+ if not href:
+ continue
+ if ios_base_url:
+ href = ios_base_url + href
+ ext = determine_ext(href)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ href, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
else:
- title = xpath_text(playlist, 'EpisodeTitle', default=None)
- info.update({
- 'title': title,
- 'episode_title': title,
- 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
- 'series': xpath_text(playlist, 'ProgrammeTitle'),
- 'duration': parse_duration(xpath_text(playlist, 'Duration')),
+ formats.append({
+ 'url': href,
})
- video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
- media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
- rtmp_url = media_files.attrib['base']
-
- for media_file in media_files.findall('MediaFile'):
- play_path = xpath_text(media_file, 'URL')
- if not play_path:
- continue
- tbr = int_or_none(media_file.get('bitrate'), 1000)
- f = {
- 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
- 'play_path': play_path,
- # Providing this swfVfy allows to avoid truncated downloads
- 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
- 'page_url': url,
- 'tbr': tbr,
- 'ext': 'flv',
- }
- app = self._search_regex(
- 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
- if app:
- f.update({
- 'url': rtmp_url.split('?', 1)[0],
- 'app': app,
- })
- else:
- f['url'] = rtmp_url
- formats.append(f)
-
- for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
- if caption_url.text:
- extract_subtitle(caption_url.text)
+ self._sort_formats(formats)
- ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
- hmac = params.get('data-video-hmac')
- if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
- headers = self.geo_verification_headers()
- headers.update({
- 'Accept': 'application/vnd.itv.vod.playlist.v2+json',
- 'Content-Type': 'application/json',
- 'hmac': hmac.upper(),
+ subtitles = {}
+ subs = video_data.get('Subtitles') or []
+ for sub in subs:
+ if not isinstance(sub, dict):
+ continue
+ href = url_or_none(sub.get('Href'))
+ if not href:
+ continue
+ subtitles.setdefault('en', []).append({
+ 'url': href,
+ 'ext': determine_ext(href, 'vtt'),
})
- ios_playlist = self._download_json(
- ios_playlist_url, video_id, data=json.dumps({
- 'user': {
- 'itvUserId': '',
- 'entitlements': [],
- 'token': ''
- },
- 'device': {
- 'manufacturer': 'Safari',
- 'model': '5',
- 'os': {
- 'name': 'Windows NT',
- 'version': '6.1',
- 'type': 'desktop'
- }
- },
- 'client': {
- 'version': '4.1',
- 'id': 'browser'
- },
- 'variantAvailability': {
- 'featureset': {
- 'min': ['hls', 'aes', 'outband-webvtt'],
- 'max': ['hls', 'aes', 'outband-webvtt']
- },
- 'platformTag': 'dotcom'
- }
- }).encode(), headers=headers, fatal=False)
- if ios_playlist:
- video_data = ios_playlist.get('Playlist', {}).get('Video', {})
- ios_base_url = video_data.get('Base')
- for media_file in video_data.get('MediaFiles', []):
- href = media_file.get('Href')
- if not href:
- continue
- if ios_base_url:
- href = ios_base_url + href
- ext = determine_ext(href)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- href, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'url': href,
- })
- subs = video_data.get('Subtitles')
- if isinstance(subs, list):
- for sub in subs:
- if not isinstance(sub, dict):
- continue
- href = url_or_none(sub.get('Href'))
- if href:
- extract_subtitle(href)
- if not info.get('duration'):
- info['duration'] = parse_duration(video_data.get('Duration'))
-
- self._sort_formats(formats)
- info.update({
+ info = self._search_json_ld(webpage, video_id, default={})
+ if not info:
+ json_ld = self._parse_json(self._search_regex(
+ JSON_LD_RE, webpage, 'JSON-LD', '{}',
+ group='json_ld'), video_id, fatal=False)
+ if json_ld and json_ld.get('@type') == 'BreadcrumbList':
+ for ile in (json_ld.get('itemListElement:') or []):
+ item = ile.get('item:') or {}
+ if item.get('@type') == 'TVEpisode':
+ item['@context'] = 'http://schema.org'
+ info = self._json_ld(item, video_id, fatal=False) or {}
+ break
+
+ return merge_dicts({
'id': video_id,
+ 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'formats': formats,
'subtitles': subtitles,
- })
-
- webpage_info = self._search_json_ld(webpage, video_id, default={})
- if not webpage_info.get('title'):
- webpage_info['title'] = self._html_search_regex(
-                r'(?s)<[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
- webpage, 'title', default=None) or self._og_search_title(
- webpage, default=None) or self._html_search_meta(
- 'twitter:title', webpage, 'title',
- default=None) or webpage_info['episode']
-
- return merge_dicts(info, webpage_info)
+ 'duration': parse_duration(video_data.get('Duration')),
+ 'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)),
+ }, info)
class ITVBTCCIE(InfoExtractor):
diff --git a/youtube_dlc/extractor/ruutu.py b/youtube_dlc/extractor/ruutu.py
index f984040aa..c50cd3ecd 100644
--- a/youtube_dlc/extractor/ruutu.py
+++ b/youtube_dlc/extractor/ruutu.py
@@ -6,14 +6,24 @@ from ..compat import compat_urllib_parse_urlparse
from ..utils import (
determine_ext,
ExtractorError,
+ find_xpath_attr,
int_or_none,
+ unified_strdate,
+ url_or_none,
xpath_attr,
xpath_text,
)
class RuutuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/|
+ static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid=
+ )
+                        (?P<id>\d+)
+ '''
_TESTS = [
{
'url': 'http://www.ruutu.fi/video/2058907',
@@ -71,15 +81,53 @@ class RuutuIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 0,
},
- 'expected_warnings': ['HTTP Error 502: Bad Gateway'],
- }
+ 'expected_warnings': [
+ 'HTTP Error 502: Bad Gateway',
+ 'Failed to download m3u8 information',
+ ],
+ },
+ {
+ 'url': 'http://www.supla.fi/audio/2231370',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790',
+ 'only_matching': True,
+ },
+ {
+ # episode
+ 'url': 'https://www.ruutu.fi/video/3401964',
+ 'info_dict': {
+ 'id': '3401964',
+ 'ext': 'mp4',
+ 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17',
+ 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2582,
+ 'age_limit': 12,
+ 'upload_date': '20190508',
+ 'series': 'Temptation Island Suomi',
+ 'season_number': 5,
+ 'episode_number': 17,
+ 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # premium
+ 'url': 'https://www.ruutu.fi/video/3618715',
+ 'only_matching': True,
+ },
]
+ _API_BASE = 'https://gatling.nelonenmedia.fi'
def _real_extract(self, url):
video_id = self._match_id(url)
video_xml = self._download_xml(
- 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+ '%s/media-xml-cache' % self._API_BASE, video_id,
query={'id': video_id})
formats = []
@@ -96,9 +144,18 @@ class RuutuIE(InfoExtractor):
continue
processed_urls.append(video_url)
ext = determine_ext(video_url)
+ auth_video_url = url_or_none(self._download_webpage(
+ '%s/auth/access/v2' % self._API_BASE, video_id,
+ note='Downloading authenticated %s stream URL' % ext,
+ fatal=False, query={'stream': video_url}))
+ if auth_video_url:
+ processed_urls.append(auth_video_url)
+ video_url = auth_video_url
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ video_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds', fatal=False))
@@ -136,18 +193,35 @@ class RuutuIE(InfoExtractor):
extract_formats(video_xml.find('./Clip'))
- drm = xpath_text(video_xml, './Clip/DRM', default=None)
- if not formats and drm:
- raise ExtractorError('This video is DRM protected.', expected=True)
+ def pv(name):
+ node = find_xpath_attr(
+ video_xml, './Clip/PassthroughVariables/variable', 'name', name)
+ if node is not None:
+ return node.get('value')
+
+ if not formats:
+ drm = xpath_text(video_xml, './Clip/DRM', default=None)
+ if drm:
+ raise ExtractorError('This video is DRM protected.', expected=True)
+ ns_st_cds = pv('ns_st_cds')
+ if ns_st_cds != 'free':
+ raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)
self._sort_formats(formats)
+ themes = pv('themes')
+
return {
'id': video_id,
'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
- 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+ 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')),
'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+ 'upload_date': unified_strdate(pv('date_start')),
+ 'series': pv('series_name'),
+ 'season_number': int_or_none(pv('season_number')),
+ 'episode_number': int_or_none(pv('episode_number')),
+ 'categories': themes.split(',') if themes else [],
'formats': formats,
}
diff --git a/youtube_dlc/extractor/wdr.py b/youtube_dlc/extractor/wdr.py
index 44d4a13ca..5cb5924f8 100644
--- a/youtube_dlc/extractor/wdr.py
+++ b/youtube_dlc/extractor/wdr.py
@@ -17,6 +17,7 @@ from ..utils import (
unified_strdate,
update_url_query,
urlhandle_detect_ext,
+ url_or_none,
)
@@ -42,15 +43,15 @@ class WDRIE(InfoExtractor):
is_live = metadata.get('mediaType') == 'live'
tracker_data = metadata['trackerData']
+ title = tracker_data['trackerClipTitle']
media_resource = metadata['mediaResource']
formats = []
- subtitles = {}
# check if the metadata contains a direct URL to a file
- for kind, media_resource in media_resource.items():
+ for kind, media in media_resource.items():
if kind == 'captionsHash':
- for ext, url in media_resource.items():
+ for ext, url in media.items():
subtitles.setdefault('de', []).append({
'url': url,
'ext': ext,
@@ -59,8 +60,10 @@ class WDRIE(InfoExtractor):
if kind not in ('dflt', 'alt'):
continue
+ if not isinstance(media, dict):
+ continue
- for tag_name, medium_url in media_resource.items():
+ for tag_name, medium_url in media.items():
if tag_name not in ('videoURL', 'audioURL'):
continue
@@ -90,7 +93,23 @@ class WDRIE(InfoExtractor):
self._sort_formats(formats)
- title = tracker_data['trackerClipTitle']
+ subtitles = {}
+ caption_url = media_resource.get('captionURL')
+ if caption_url:
+ subtitles['de'] = [{
+ 'url': caption_url,
+ 'ext': 'ttml',
+ }]
+ captions_hash = media_resource.get('captionsHash')
+ if isinstance(captions_hash, dict):
+ for ext, format_url in captions_hash.items():
+ format_url = url_or_none(format_url)
+ if not format_url:
+ continue
+ subtitles.setdefault('de', []).append({
+ 'url': format_url,
+ 'ext': determine_ext(format_url, None) or ext,
+ })
return {
'id': tracker_data.get('trackerClipId', video_id),
@@ -106,7 +125,7 @@ class WDRIE(InfoExtractor):
class WDRPageIE(InfoExtractor):
_CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
    _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
- _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
+ _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_TESTS = [
{
@@ -213,7 +232,11 @@ class WDRPageIE(InfoExtractor):
{
'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
'only_matching': True,
- }
+ },
+ {
+ 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):