Update to release 2020.12.12

pull/280/head
pukkandan 5 years ago
parent ff695879bc
commit b1ef860624

@ -2505,7 +2505,7 @@ class YoutubeDL(object):
thumb_ext = determine_ext(t['url'], 'jpg') thumb_ext = determine_ext(t['url'], 'jpg')
suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
self.to_screen('[%s] %s: Thumbnail %sis already present' % self.to_screen('[%s] %s: Thumbnail %sis already present' %

@ -16,14 +16,17 @@ from ..utils import (
clean_html, clean_html,
error_to_compat_str, error_to_compat_str,
ExtractorError, ExtractorError,
float_or_none,
get_element_by_id, get_element_by_id,
int_or_none, int_or_none,
js_to_json, js_to_json,
limit_length, limit_length,
parse_count, parse_count,
qualities,
sanitized_Request, sanitized_Request,
try_get, try_get,
urlencode_postdata, urlencode_postdata,
urljoin,
) )
@ -39,7 +42,8 @@ class FacebookIE(InfoExtractor):
photo\.php| photo\.php|
video\.php| video\.php|
video/embed| video/embed|
story\.php story\.php|
watch(?:/live)?/?
)\?(?:.*?)(?:v|video_id|story_fbid)=| )\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?| [^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/| [^/]+/posts/|
@ -54,8 +58,6 @@ class FacebookIE(InfoExtractor):
_NETRC_MACHINE = 'facebook' _NETRC_MACHINE = 'facebook'
IE_NAME = 'facebook' IE_NAME = 'facebook'
_CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
@ -72,6 +74,7 @@ class FacebookIE(InfoExtractor):
}, },
'skip': 'Requires logging in', 'skip': 'Requires logging in',
}, { }, {
# data.video
'url': 'https://www.facebook.com/video.php?v=274175099429670', 'url': 'https://www.facebook.com/video.php?v=274175099429670',
'info_dict': { 'info_dict': {
'id': '274175099429670', 'id': '274175099429670',
@ -133,6 +136,7 @@ class FacebookIE(InfoExtractor):
}, },
}, { }, {
# have 1080P, but only up to 720p in swf params # have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
'md5': '9571fae53d4165bbbadb17a94651dcdc', 'md5': '9571fae53d4165bbbadb17a94651dcdc',
'info_dict': { 'info_dict': {
@ -147,6 +151,7 @@ class FacebookIE(InfoExtractor):
}, },
}, { }, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
'info_dict': { 'info_dict': {
'id': '1417995061575415', 'id': '1417995061575415',
@ -174,6 +179,7 @@ class FacebookIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': { 'info_dict': {
'id': '1396382447100162', 'id': '1396382447100162',
@ -193,18 +199,23 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.mediaset.currMedia.edges
'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.video.story.attachments[].media
'url': 'facebook:544765982287235', 'url': 'facebook:544765982287235',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.video.creation_story.attachments[].media
'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.video
'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
'only_matching': True, 'only_matching': True,
}, { }, {
@ -212,6 +223,7 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
'only_matching': True, 'only_matching': True,
}, { }, {
# data.video
'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
'info_dict': { 'info_dict': {
'id': '359649331226507', 'id': '359649331226507',
@ -222,7 +234,54 @@ class FacebookIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
'info_dict': {
'id': '106560053808006',
},
'playlist_count': 2,
}, {
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/watch/?v=647537299265662',
'only_matching': True,
}, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
'info_dict': {
'id': '10157667649866271',
},
'playlist_count': 3,
}, {
# data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
'info_dict': {
'id': '117576630041613',
'ext': 'mp4',
# TODO: title can be extracted from video page
'title': 'Facebook video #117576630041613',
'uploader_id': '189393014416438',
'upload_date': '20201123',
'timestamp': 1606162592,
},
'skip': 'Requires logging in',
}, {
# node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
'info_dict': {
'id': '211567722618337',
'ext': 'mp4',
'title': 'Facebook video #211567722618337',
'uploader_id': '127875227654254',
'upload_date': '20161122',
'timestamp': 1479793574,
},
}, {
# data.video.creation_story.attachments[].media
'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
'only_matching': True,
}] }]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
@ -305,23 +364,24 @@ class FacebookIE(InfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _extract_from_url(self, url, video_id, fatal_if_no_video=True): def _extract_from_url(self, url, video_id):
req = sanitized_Request(url) webpage = self._download_webpage(
req.add_header('User-Agent', self._CHROME_USER_AGENT) url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
webpage = self._download_webpage(req, video_id)
video_data = None video_data = None
def extract_video_data(instances): def extract_video_data(instances):
video_data = []
for item in instances: for item in instances:
if item[1][0] == 'VideoConfig': if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
video_item = item[2][0] video_item = item[2][0]
if video_item.get('video_id'): if video_item.get('video_id'):
return video_item['videoData'] video_data.append(video_item['videoData'])
return video_data
server_js_data = self._parse_json(self._search_regex( server_js_data = self._parse_json(self._search_regex(
r'handleServerJS\(({.+})(?:\);|,")', webpage, [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
'server js data', default='{}'), video_id, fatal=False) webpage, 'server js data', default='{}'), video_id, fatal=False)
if server_js_data: if server_js_data:
video_data = extract_video_data(server_js_data.get('instances', [])) video_data = extract_video_data(server_js_data.get('instances', []))
@ -331,17 +391,111 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats):
    """Append the formats described by ``video['dash_manifest']`` to ``formats``.

    video: mapping that may carry a 'dash_manifest' string.
    formats: list that is extended in place; left untouched when the
        manifest is missing or empty.

    Returns None.
    """
    manifest = video.get('dash_manifest')
    if not manifest:
        return
    # The manifest string is passed through compat_urllib_parse_unquote_plus
    # before being parsed as XML, so it is presumably URL-quoted MPD text.
    mpd_doc = compat_etree_fromstring(compat_urllib_parse_unquote_plus(manifest))
    formats.extend(self._parse_mpd_formats(mpd_doc))
def process_formats(formats):
    """Set the download User-Agent on every format, then sort ``formats``.

    formats: list of format dicts; each dict and the list itself are
        modified in place.
    """
    # Downloads with browser's User-Agent are rate limited. Working around
    # with non-browser User-Agent.
    for fmt in formats:
        headers = fmt.setdefault('http_headers', {})
        headers['User-Agent'] = 'facebookexternalhit/1.1'
    self._sort_formats(formats)
if not video_data: if not video_data:
server_js_data = self._parse_json( server_js_data = self._parse_json(self._search_regex([
self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX
webpage, 'js data', default='{}'), ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
video_id, transform_source=js_to_json, fatal=False)
video_data = extract_from_jsmods_instances(server_js_data) video_data = extract_from_jsmods_instances(server_js_data)
if not video_data: if not video_data:
if not fatal_if_no_video: graphql_data = self._parse_json(self._search_regex(
return webpage, False r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
for require in (graphql_data.get('require') or []):
if require[0] == 'RelayPrefetchedStreamCache':
entries = []
def parse_graphql_video(video):
    """Build an info dict from a GraphQL video node and append it to ``entries``.

    video: mapping describing one video; the keys read here are
        'playable_url', 'playable_url_quality_hd', 'dash_manifest',
        'videoId', 'id', 'thumbnailImage', 'owner', 'publish_time',
        'playable_duration_in_ms', 'savable_description' and 'name'.

    Returns None; the result is delivered through the enclosing ``entries``
    list.
    """
    rank = qualities(['sd', 'hd'])
    formats = []
    for url_key, label in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd')):
        stream_url = video.get(url_key)
        if stream_url:
            formats.append({
                'format_id': label,
                'quality': rank(label),
                'url': stream_url,
            })
    extract_dash_manifest(video, formats)
    process_formats(formats)

    # Fall back to the id of the page being extracted when the node has none.
    entry_id = video.get('videoId') or video.get('id') or video_id
    info = {
        'id': entry_id,
        'formats': formats,
        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
        'uploader_id': try_get(video, lambda x: x['owner']['id']),
        'timestamp': int_or_none(video.get('publish_time')),
        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
    }

    summary = try_get(video, lambda x: x['savable_description']['text'])
    name = video.get('name')
    if name:
        info['title'] = name
        info['description'] = summary
    else:
        # Without a name the description doubles as the title (and no
        # separate 'description' key is set); a generic title is the last
        # resort.
        info['title'] = summary or 'Facebook video #%s' % entry_id
    entries.append(info)
def parse_attachment(attachment, key='media'):
    """Hand the video node stored under ``attachment[key]`` to parse_graphql_video.

    attachment: mapping expected to hold a media node under ``key``.  Callers
        may pass None when an earlier lookup came back empty (the result of a
        ``try_get(..., dict)`` is passed straight in), so anything that is
        not a dict is ignored instead of raising AttributeError.
    key: name of the entry that holds the media node ('media' by default).

    Returns the result of parse_graphql_video when the node's '__typename'
    is 'Video'; otherwise None.
    """
    if not isinstance(attachment, dict):
        return None
    media = attachment.get(key)
    # A missing, empty or non-mapping media value cannot describe a video.
    if not isinstance(media, dict):
        return None
    if media.get('__typename') == 'Video':
        return parse_graphql_video(media)
    return None
data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
nodes = data.get('nodes') or []
node = data.get('node') or {}
if not nodes and node:
nodes.append(node)
for node in nodes:
story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
attachments = try_get(story, [
lambda x: x['attached_story']['attachments'],
lambda x: x['attachments']
], list) or []
for attachment in attachments:
attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
for n in ns:
parse_attachment(n)
parse_attachment(attachment)
edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
for edge in edges:
parse_attachment(edge, key='node')
video = data.get('video') or {}
if video:
attachments = try_get(video, [
lambda x: x['story']['attachments'],
lambda x: x['creation_story']['attachments']
], list) or []
for attachment in attachments:
parse_attachment(attachment)
if not entries:
parse_graphql_video(video)
return self.playlist_result(entries, video_id)
if not video_data:
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None: if m_msg is not None:
raise ExtractorError( raise ExtractorError(
@ -379,8 +533,19 @@ class FacebookIE(InfoExtractor):
if not video_data: if not video_data:
raise ExtractorError('Cannot parse data') raise ExtractorError('Cannot parse data')
subtitles = {} if len(video_data) > 1:
entries = []
for v in video_data:
video_url = v[0].get('video_url')
if not video_url:
continue
entries.append(self.url_result(urljoin(
url, video_url), self.ie_key(), v[0].get('video_id')))
return self.playlist_result(entries, video_id)
video_data = video_data[0]
formats = [] formats = []
subtitles = {}
for f in video_data: for f in video_data:
format_id = f['stream_type'] format_id = f['stream_type']
if f and isinstance(f, dict): if f and isinstance(f, dict):
@ -399,22 +564,14 @@ class FacebookIE(InfoExtractor):
'url': src, 'url': src,
'preference': preference, 'preference': preference,
}) })
dash_manifest = f[0].get('dash_manifest') extract_dash_manifest(f[0], formats)
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
subtitles_src = f[0].get('subtitles_src') subtitles_src = f[0].get('subtitles_src')
if subtitles_src: if subtitles_src:
subtitles.setdefault('en', []).append({'url': subtitles_src}) subtitles.setdefault('en', []).append({'url': subtitles_src})
if not formats: if not formats:
raise ExtractorError('Cannot find video formats') raise ExtractorError('Cannot find video formats')
# Downloads with browser's User-Agent are rate limited. Working around process_formats(formats)
# with non-browser User-Agent.
for f in formats:
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
self._sort_formats(formats)
video_title = self._html_search_regex( video_title = self._html_search_regex(
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
@ -454,35 +611,13 @@ class FacebookIE(InfoExtractor):
'subtitles': subtitles, 'subtitles': subtitles,
} }
return webpage, info_dict return info_dict
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) return self._extract_from_url(real_url, video_id)
if info_dict:
return info_dict
if '/posts/' in url:
video_id_json = self._search_regex(
r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
default='')
if video_id_json:
entries = [
self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
for vid in self._parse_json(video_id_json, video_id)]
return self.playlist_result(entries, video_id)
# Single Video?
video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
else:
_, info_dict = self._extract_from_url(
self._VIDEO_PAGE_TEMPLATE % video_id,
video_id, fatal_if_no_video=True)
return info_dict
class FacebookPluginsVideoIE(InfoExtractor): class FacebookPluginsVideoIE(InfoExtractor):

@ -1,30 +1,22 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import uuid
import xml.etree.ElementTree as etree
import json import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .brightcove import BrightcoveNewIE from .brightcove import BrightcoveNewIE
from ..compat import (
compat_str,
compat_etree_register_namespace,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError,
extract_attributes, extract_attributes,
int_or_none, get_element_by_class,
JSON_LD_RE,
merge_dicts, merge_dicts,
parse_duration, parse_duration,
smuggle_url, smuggle_url,
strip_or_none,
try_get, try_get,
url_or_none, url_or_none,
xpath_with_ns,
xpath_element,
xpath_text,
) )
@ -32,14 +24,18 @@ class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB'] _GEO_COUNTRIES = ['GB']
_TESTS = [{ _TESTS = [{
'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'url': 'https://www.itv.com/hub/liar/2a4547a0012',
'info_dict': { 'info_dict': {
'id': '2a2936a0053', 'id': '2a4547a0012',
'ext': 'flv', 'ext': 'mp4',
'title': 'Home Movie', 'title': 'Liar - Series 2 - Episode 6',
'description': 'md5:d0f91536569dec79ea184f0a44cca089',
'series': 'Liar',
'season_number': 2,
'episode_number': 6,
}, },
'params': { 'params': {
# rtmp download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
@ -62,220 +58,97 @@ class ITVIE(InfoExtractor):
params = extract_attributes(self._search_regex( params = extract_attributes(self._search_regex(
r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
ns_map = { ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', hmac = params['data-video-hmac']
'tem': 'http://tempuri.org/',
'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
'com': 'http://schemas.itv.com/2009/05/Common',
}
for ns, full_ns in ns_map.items():
compat_etree_register_namespace(ns, full_ns)
def _add_ns(name):
return xpath_with_ns(name, ns_map)
def _add_sub_element(element, name):
return etree.SubElement(element, _add_ns(name))
production_id = (
params.get('data-video-autoplay-id')
or '%s#001' % (
params.get('data-video-episode-id')
or video_id.replace('a', '/')))
req_env = etree.Element(_add_ns('soapenv:Envelope'))
_add_sub_element(req_env, 'soapenv:Header')
body = _add_sub_element(req_env, 'soapenv:Body')
get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
request = _add_sub_element(get_playlist, 'tem:request')
_add_sub_element(request, 'itv:ProductionId').text = production_id
_add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
vodcrid = _add_sub_element(request, 'itv:Vodcrid')
_add_sub_element(vodcrid, 'com:Id')
_add_sub_element(request, 'itv:Partition')
user_info = _add_sub_element(get_playlist, 'tem:userInfo')
_add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
_add_sub_element(user_info, 'itv:DM')
_add_sub_element(user_info, 'itv:RevenueScienceValue')
_add_sub_element(user_info, 'itv:SessionId')
_add_sub_element(user_info, 'itv:SsoToken')
_add_sub_element(user_info, 'itv:UserToken')
site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
_add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
_add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
_add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
_add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
_add_sub_element(site_info, 'itv:Category')
_add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
_add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
_add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
_add_sub_element(player_info, 'itv:Version').text = '2'
headers = self.geo_verification_headers() headers = self.geo_verification_headers()
headers.update({ headers.update({
'Content-Type': 'text/xml; charset=utf-8', 'Accept': 'application/vnd.itv.vod.playlist.v2+json',
'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', 'Content-Type': 'application/json',
'hmac': hmac.upper(),
}) })
ios_playlist = self._download_json(
ios_playlist_url, video_id, data=json.dumps({
'user': {
'itvUserId': '',
'entitlements': [],
'token': ''
},
'device': {
'manufacturer': 'Safari',
'model': '5',
'os': {
'name': 'Windows NT',
'version': '6.1',
'type': 'desktop'
}
},
'client': {
'version': '4.1',
'id': 'browser'
},
'variantAvailability': {
'featureset': {
'min': ['hls', 'aes', 'outband-webvtt'],
'max': ['hls', 'aes', 'outband-webvtt']
},
'platformTag': 'dotcom'
}
}).encode(), headers=headers)
video_data = ios_playlist['Playlist']['Video']
ios_base_url = video_data.get('Base')
info = self._search_json_ld(webpage, video_id, default={})
formats = [] formats = []
subtitles = {} for media_file in (video_data.get('MediaFiles') or []):
href = media_file.get('Href')
def extract_subtitle(sub_url): if not href:
ext = determine_ext(sub_url, 'ttml') continue
subtitles.setdefault('en', []).append({ if ios_base_url:
'url': sub_url, href = ios_base_url + href
'ext': 'ttml' if ext == 'xml' else ext, ext = determine_ext(href)
}) if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
resp_env = self._download_xml( href, video_id, 'mp4', entry_protocol='m3u8_native',
params['data-playlist-url'], video_id, m3u8_id='hls', fatal=False))
headers=headers, data=etree.tostring(req_env), fatal=False)
if resp_env:
playlist = xpath_element(resp_env, './/Playlist')
if playlist is None:
fault_code = xpath_text(resp_env, './/faultcode')
fault_string = xpath_text(resp_env, './/faultstring')
if fault_code == 'InvalidGeoRegion':
self.raise_geo_restricted(
msg=fault_string, countries=self._GEO_COUNTRIES)
elif fault_code not in (
'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, fault_string), expected=True)
info.update({
'title': self._og_search_title(webpage),
'episode_title': params.get('data-video-episode'),
'series': params.get('data-video-title'),
})
else: else:
title = xpath_text(playlist, 'EpisodeTitle', default=None) formats.append({
info.update({ 'url': href,
'title': title,
'episode_title': title,
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
'duration': parse_duration(xpath_text(playlist, 'Duration')),
}) })
video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) self._sort_formats(formats)
media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
rtmp_url = media_files.attrib['base']
for media_file in media_files.findall('MediaFile'):
play_path = xpath_text(media_file, 'URL')
if not play_path:
continue
tbr = int_or_none(media_file.get('bitrate'), 1000)
f = {
'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
'play_path': play_path,
# Providing this swfVfy allows to avoid truncated downloads
'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
'page_url': url,
'tbr': tbr,
'ext': 'flv',
}
app = self._search_regex(
'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
if app:
f.update({
'url': rtmp_url.split('?', 1)[0],
'app': app,
})
else:
f['url'] = rtmp_url
formats.append(f)
for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
if caption_url.text:
extract_subtitle(caption_url.text)
ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') subtitles = {}
hmac = params.get('data-video-hmac') subs = video_data.get('Subtitles') or []
if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): for sub in subs:
headers = self.geo_verification_headers() if not isinstance(sub, dict):
headers.update({ continue
'Accept': 'application/vnd.itv.vod.playlist.v2+json', href = url_or_none(sub.get('Href'))
'Content-Type': 'application/json', if not href:
'hmac': hmac.upper(), continue
subtitles.setdefault('en', []).append({
'url': href,
'ext': determine_ext(href, 'vtt'),
}) })
ios_playlist = self._download_json(
ios_playlist_url, video_id, data=json.dumps({
'user': {
'itvUserId': '',
'entitlements': [],
'token': ''
},
'device': {
'manufacturer': 'Safari',
'model': '5',
'os': {
'name': 'Windows NT',
'version': '6.1',
'type': 'desktop'
}
},
'client': {
'version': '4.1',
'id': 'browser'
},
'variantAvailability': {
'featureset': {
'min': ['hls', 'aes', 'outband-webvtt'],
'max': ['hls', 'aes', 'outband-webvtt']
},
'platformTag': 'dotcom'
}
}).encode(), headers=headers, fatal=False)
if ios_playlist:
video_data = ios_playlist.get('Playlist', {}).get('Video', {})
ios_base_url = video_data.get('Base')
for media_file in video_data.get('MediaFiles', []):
href = media_file.get('Href')
if not href:
continue
if ios_base_url:
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
href, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
})
subs = video_data.get('Subtitles')
if isinstance(subs, list):
for sub in subs:
if not isinstance(sub, dict):
continue
href = url_or_none(sub.get('Href'))
if href:
extract_subtitle(href)
if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration'))
self._sort_formats(formats)
info.update({ info = self._search_json_ld(webpage, video_id, default={})
if not info:
json_ld = self._parse_json(self._search_regex(
JSON_LD_RE, webpage, 'JSON-LD', '{}',
group='json_ld'), video_id, fatal=False)
if json_ld and json_ld.get('@type') == 'BreadcrumbList':
for ile in (json_ld.get('itemListElement:') or []):
item = ile.get('item:') or {}
if item.get('@type') == 'TVEpisode':
item['@context'] = 'http://schema.org'
info = self._json_ld(item, video_id, fatal=False) or {}
break
return merge_dicts({
'id': video_id, 'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
}) 'duration': parse_duration(video_data.get('Duration')),
'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)),
webpage_info = self._search_json_ld(webpage, video_id, default={}) }, info)
if not webpage_info.get('title'):
webpage_info['title'] = self._html_search_regex(
r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
default=None) or webpage_info['episode']
return merge_dicts(info, webpage_info)
class ITVBTCCIE(InfoExtractor): class ITVBTCCIE(InfoExtractor):

@ -6,14 +6,24 @@ from ..compat import compat_urllib_parse_urlparse
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
find_xpath_attr,
int_or_none, int_or_none,
unified_strdate,
url_or_none,
xpath_attr, xpath_attr,
xpath_text, xpath_text,
) )
class RuutuIE(InfoExtractor): class RuutuIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)' _VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/|
static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid=
)
(?P<id>\d+)
'''
_TESTS = [ _TESTS = [
{ {
'url': 'http://www.ruutu.fi/video/2058907', 'url': 'http://www.ruutu.fi/video/2058907',
@ -71,15 +81,53 @@ class RuutuIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 0, 'age_limit': 0,
}, },
'expected_warnings': ['HTTP Error 502: Bad Gateway'], 'expected_warnings': [
} 'HTTP Error 502: Bad Gateway',
'Failed to download m3u8 information',
],
},
{
'url': 'http://www.supla.fi/audio/2231370',
'only_matching': True,
},
{
'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790',
'only_matching': True,
},
{
# episode
'url': 'https://www.ruutu.fi/video/3401964',
'info_dict': {
'id': '3401964',
'ext': 'mp4',
'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17',
'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2582,
'age_limit': 12,
'upload_date': '20190508',
'series': 'Temptation Island Suomi',
'season_number': 5,
'episode_number': 17,
'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'],
},
'params': {
'skip_download': True,
},
},
{
# premium
'url': 'https://www.ruutu.fi/video/3618715',
'only_matching': True,
},
] ]
_API_BASE = 'https://gatling.nelonenmedia.fi'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video_xml = self._download_xml( video_xml = self._download_xml(
'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, '%s/media-xml-cache' % self._API_BASE, video_id,
query={'id': video_id}) query={'id': video_id})
formats = [] formats = []
@ -96,9 +144,18 @@ class RuutuIE(InfoExtractor):
continue continue
processed_urls.append(video_url) processed_urls.append(video_url)
ext = determine_ext(video_url) ext = determine_ext(video_url)
auth_video_url = url_or_none(self._download_webpage(
'%s/auth/access/v2' % self._API_BASE, video_id,
note='Downloading authenticated %s stream URL' % ext,
fatal=False, query={'stream': video_url}))
if auth_video_url:
processed_urls.append(auth_video_url)
video_url = auth_video_url
if ext == 'm3u8': if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) video_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
elif ext == 'f4m': elif ext == 'f4m':
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds', fatal=False)) video_url, video_id, f4m_id='hds', fatal=False))
@ -136,18 +193,35 @@ class RuutuIE(InfoExtractor):
extract_formats(video_xml.find('./Clip')) extract_formats(video_xml.find('./Clip'))
drm = xpath_text(video_xml, './Clip/DRM', default=None) def pv(name):
if not formats and drm: node = find_xpath_attr(
raise ExtractorError('This video is DRM protected.', expected=True) video_xml, './Clip/PassthroughVariables/variable', 'name', name)
if node is not None:
return node.get('value')
if not formats:
drm = xpath_text(video_xml, './Clip/DRM', default=None)
if drm:
raise ExtractorError('This video is DRM protected.', expected=True)
ns_st_cds = pv('ns_st_cds')
if ns_st_cds != 'free':
raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)
self._sort_formats(formats) self._sort_formats(formats)
themes = pv('themes')
return { return {
'id': video_id, 'id': video_id,
'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')),
'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
'upload_date': unified_strdate(pv('date_start')),
'series': pv('series_name'),
'season_number': int_or_none(pv('season_number')),
'episode_number': int_or_none(pv('episode_number')),
'categories': themes.split(',') if themes else [],
'formats': formats, 'formats': formats,
} }

@ -17,6 +17,7 @@ from ..utils import (
unified_strdate, unified_strdate,
update_url_query, update_url_query,
urlhandle_detect_ext, urlhandle_detect_ext,
url_or_none,
) )
@ -42,15 +43,15 @@ class WDRIE(InfoExtractor):
is_live = metadata.get('mediaType') == 'live' is_live = metadata.get('mediaType') == 'live'
tracker_data = metadata['trackerData'] tracker_data = metadata['trackerData']
title = tracker_data['trackerClipTitle']
media_resource = metadata['mediaResource'] media_resource = metadata['mediaResource']
formats = [] formats = []
subtitles = {}
# check if the metadata contains a direct URL to a file # check if the metadata contains a direct URL to a file
for kind, media_resource in media_resource.items(): for kind, media in media_resource.items():
if kind == 'captionsHash': if kind == 'captionsHash':
for ext, url in media_resource.items(): for ext, url in media.items():
subtitles.setdefault('de', []).append({ subtitles.setdefault('de', []).append({
'url': url, 'url': url,
'ext': ext, 'ext': ext,
@ -59,8 +60,10 @@ class WDRIE(InfoExtractor):
if kind not in ('dflt', 'alt'): if kind not in ('dflt', 'alt'):
continue continue
if not isinstance(media, dict):
continue
for tag_name, medium_url in media_resource.items(): for tag_name, medium_url in media.items():
if tag_name not in ('videoURL', 'audioURL'): if tag_name not in ('videoURL', 'audioURL'):
continue continue
@ -90,7 +93,23 @@ class WDRIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
title = tracker_data['trackerClipTitle'] subtitles = {}
caption_url = media_resource.get('captionURL')
if caption_url:
subtitles['de'] = [{
'url': caption_url,
'ext': 'ttml',
}]
captions_hash = media_resource.get('captionsHash')
if isinstance(captions_hash, dict):
for ext, format_url in captions_hash.items():
format_url = url_or_none(format_url)
if not format_url:
continue
subtitles.setdefault('de', []).append({
'url': format_url,
'ext': determine_ext(format_url, None) or ext,
})
return { return {
'id': tracker_data.get('trackerClipId', video_id), 'id': tracker_data.get('trackerClipId', video_id),
@ -106,7 +125,7 @@ class WDRIE(InfoExtractor):
class WDRPageIE(InfoExtractor): class WDRPageIE(InfoExtractor):
_CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
_PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
_VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_TESTS = [ _TESTS = [
{ {
@ -213,7 +232,11 @@ class WDRPageIE(InfoExtractor):
{ {
'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
'only_matching': True, 'only_matching': True,
} },
{
'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
'only_matching': True,
},
] ]
def _real_extract(self, url): def _real_extract(self, url):

Loading…
Cancel
Save