diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index b38b01c28..45ee65728 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -112,6 +112,7 @@
- **blinkx**
- **Bloomberg**
- **BokeCC**
+ - **BongaCams**
- **BostonGlobe**
- **Box**
- **Bpb**: Bundeszentrale für politische Bildung
@@ -146,6 +147,7 @@
- **CBS**
- **CBSInteractive**
- **CBSLocal**
+ - **CBSLocalArticle**
- **cbsnews**: CBS News
- **cbsnews:embed**
- **cbsnews:livevideo**: CBS News Live Videos
@@ -198,6 +200,7 @@
- **CSNNE**
- **CSpan**: C-SPAN
- **CtsNews**: 華視新聞
+ - **CTV**
- **CTVNews**
- **cu.ntv.co.jp**: Nippon Television Network
- **Culturebox**
@@ -1119,6 +1122,7 @@
- **WeiboMobile**
- **WeiqiTV**: WQTV
- **Wistia**
+ - **WistiaPlaylist**
- **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **WorldStarHipHop**
- **WSJ**: Wall Street Journal
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 8dcdc4e58..130038c0d 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -39,7 +39,7 @@ class TestAllURLsMatching(unittest.TestCase):
assertTab('https://www.youtube.com/embedded')
assertTab('https://www.youtube.com/feed') # Own channel's home page
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
- assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
+ assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
@@ -60,8 +60,8 @@ class TestAllURLsMatching(unittest.TestCase):
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
- # def test_youtube_user_matching(self):
- # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
+ def test_youtube_user_matching(self):
+ self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py
index 54cbcdc8e..b4daee54e 100644
--- a/youtube_dlc/extractor/bbc.py
+++ b/youtube_dlc/extractor/bbc.py
@@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor):
_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'
- _MEDIASELECTOR_URLS = [
+ _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
+ _MEDIA_SETS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
        # with geolocation in some cases even when it's not geo-restricted at all (e.g.
# http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ 'iptv-all',
+ 'pc',
]
- _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
_EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
- _NAMESPACES = (
- _MEDIASELECTION_NS,
- _EMP_PLAYLIST_NS,
- )
-
_TESTS = [
{
'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
@@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor):
'only_matching': True,
}]
- _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
-
def _login(self):
username, password = self._get_login_info()
if username is None:
@@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor):
def _extract_items(self, playlist):
return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
- def _findall_ns(self, element, xpath):
- elements = []
- for ns in self._NAMESPACES:
- elements.extend(element.findall(xpath % ns))
- return elements
-
def _extract_medias(self, media_selection):
- error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
- if error is None:
- media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
- if error is not None:
- raise BBCCoUkIE.MediaSelectionError(error.get('id'))
- return self._findall_ns(media_selection, './{%s}media')
+ error = media_selection.get('result')
+ if error:
+ raise BBCCoUkIE.MediaSelectionError(error)
+ return media_selection.get('media') or []
def _extract_connections(self, media):
- return self._findall_ns(media, './{%s}connection')
+ return media.get('connection') or []
def _get_subtitles(self, media, programme_id):
subtitles = {}
@@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor):
cc_url, programme_id, 'Downloading captions', fatal=False)
if not isinstance(captions, compat_etree_Element):
continue
- lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
- subtitles[lang] = [
+ subtitles['en'] = [
{
'url': connection.get('href'),
'ext': 'ttml',
},
]
+ break
return subtitles
def _raise_extractor_error(self, media_selection_error):
@@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor):
def _download_media_selector(self, programme_id):
last_exception = None
- for mediaselector_url in self._MEDIASELECTOR_URLS:
+ for media_set in self._MEDIA_SETS:
try:
return self._download_media_selector_url(
- mediaselector_url % programme_id, programme_id)
+ self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
except BBCCoUkIE.MediaSelectionError as e:
if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
last_exception = e
@@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor):
self._raise_extractor_error(last_exception)
def _download_media_selector_url(self, url, programme_id=None):
- media_selection = self._download_xml(
- url, programme_id, 'Downloading media selection XML',
+ media_selection = self._download_json(
+ url, programme_id, 'Downloading media selection JSON',
expected_status=(403, 404))
return self._process_media_selector(media_selection, programme_id)
@@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor):
if kind in ('video', 'audio'):
bitrate = int_or_none(media.get('bitrate'))
encoding = media.get('encoding')
- service = media.get('service')
width = int_or_none(media.get('width'))
height = int_or_none(media.get('height'))
file_size = int_or_none(media.get('media_file_size'))
@@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor):
supplier = connection.get('supplier')
transfer_format = connection.get('transferFormat')
format_id = supplier or conn_kind or protocol
- if service:
- format_id = '%s_%s' % (service, format_id)
# ASX playlist
if supplier == 'asx':
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False))
- if re.search(self._USP_RE, href):
- usp_formats = self._extract_m3u8_formats(
- re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
- programme_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=format_id, fatal=False)
- for f in usp_formats:
- if f.get('height') and f['height'] > 720:
- continue
- formats.append(f)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
else:
- if not service and not supplier and bitrate:
+ if not supplier and bitrate:
format_id += '-%d' % bitrate
fmt = {
'format_id': format_id,
@@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor):
webpage = self._download_webpage(url, group_id, 'Downloading video page')
error = self._search_regex(
-            r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
+            r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
webpage, 'error', default=None)
if error:
raise ExtractorError(error, expected=True)
@@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE):
IE_DESC = 'BBC'
    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
- _MEDIASELECTOR_URLS = [
- # Provides HQ HLS streams but fails with geolocation in some cases when it's
- # even not geo restricted at all
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
- # Provides more formats, namely direct mp4 links, but fails on some videos with
- # notukerror for non UK (?) users (e.g.
- # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
- 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
- # Provides fewer formats, but works everywhere for everybody (hopefully)
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ _MEDIA_SETS = [
+ 'mobile-tablet-main',
+ 'pc',
]
_TESTS = [{
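Note on the mediaselector change above: API v5 returned XML and needed one full URL per attempt, while v6 returns JSON and only the media-set name varies, so the URL list collapses into a single template plus `_MEDIA_SETS`. A minimal sketch of the resulting selection loop, assuming the v6 response carries `result` on failure and `media` on success (the keys the new `_extract_medias` reads); `fetch_json` stands in for `_download_json`:

```python
# Sketch of the new selection flow (assumed v6 response shapes, inferred
# from _extract_medias in the patch: {"result": "<error id>"} on failure,
# {"media": [...]} on success).
MEDIA_SELECTOR_URL_TEMPL = ('https://open.live.bbc.co.uk/mediaselector/6/select'
                            '/version/2.0/mediaset/%s/vpid/%s')
MEDIA_SETS = ('iptv-all', 'pc')
RETRYABLE = ('notukerror', 'geolocation', 'selectionunavailable')


def select_media(programme_id, fetch_json):
    """fetch_json(url) -> dict; stands in for InfoExtractor._download_json."""
    last_error = None
    for media_set in MEDIA_SETS:
        selection = fetch_json(MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id))
        error = selection.get('result')
        if error in RETRYABLE:
            last_error = error  # remember and fall through to the next media set
            continue
        if error:
            raise RuntimeError('media selection failed: %s' % error)
        return selection.get('media') or []
    raise RuntimeError('media selection failed: %s' % last_error)
```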
diff --git a/youtube_dlc/extractor/bongacams.py b/youtube_dlc/extractor/bongacams.py
new file mode 100644
index 000000000..180542fbc
--- /dev/null
+++ b/youtube_dlc/extractor/bongacams.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class BongaCamsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?]+)'
+ _TESTS = [{
+ 'url': 'https://de.bongacams.com/azumi-8',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://cn.bongacams.com/azumi-8',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
+ channel_id = mobj.group('id')
+
+ amf = self._download_json(
+ 'https://%s/tools/amf.php' % host, channel_id,
+ data=urlencode_postdata((
+ ('method', 'getRoomData'),
+ ('args[]', channel_id),
+ ('args[]', 'false'),
+ )), headers={'X-Requested-With': 'XMLHttpRequest'})
+
+ server_url = amf['localData']['videoServerUrl']
+
+ uploader_id = try_get(
+ amf, lambda x: x['performerData']['username'], compat_str) or channel_id
+ uploader = try_get(
+ amf, lambda x: x['performerData']['displayName'], compat_str)
+ like_count = int_or_none(try_get(
+ amf, lambda x: x['performerData']['loversCount']))
+
+ formats = self._extract_m3u8_formats(
+ '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id),
+ channel_id, 'mp4', m3u8_id='hls', live=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': channel_id,
+ 'title': self._live_title(uploader or uploader_id),
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'like_count': like_count,
+ 'age_limit': 18,
+ 'is_live': True,
+ 'formats': formats,
+ }
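A note on the new BongaCams extractor: everything hangs off the AMF gateway response, and the HLS URL is composed from `videoServerUrl` plus the performer's username. An illustrative payload showing just the keys the extractor reads (every value here is invented, not captured from the live site):

```python
# Illustrative getRoomData payload; only the keys the extractor reads are
# shown, and all values are made up for the example.
amf = {
    'localData': {'videoServerUrl': 'https://edge12.example.bongacams.com'},
    'performerData': {
        'username': 'azumi-8',      # extractor falls back to the URL slug
        'displayName': 'Azumi',
        'loversCount': 1234,
    },
}

# From which the extractor requests:
#   https://edge12.example.bongacams.com/hls/stream_azumi-8/playlist.m3u8
hls_url = '%s/hls/stream_%s/playlist.m3u8' % (
    amf['localData']['videoServerUrl'], amf['performerData']['username'])
```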
diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py
index b5d94d60a..6e68f0960 100644
--- a/youtube_dlc/extractor/extractors.py
+++ b/youtube_dlc/extractor/extractors.py
@@ -127,6 +127,7 @@ from .bleacherreport import (
from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
+from .bongacams import BongaCamsIE
from .bostonglobe import BostonGlobeIE
from .box import BoxIE
from .bpb import BpbIE
@@ -1492,7 +1493,10 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
-from .wistia import WistiaIE
+from .wistia import (
+ WistiaIE,
+ WistiaPlaylistIE,
+)
from .worldstarhiphop import WorldStarHipHopIE
from .wsj import (
WSJIE,
diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py
index f7b5c67bb..81c2ae650 100644
--- a/youtube_dlc/extractor/generic.py
+++ b/youtube_dlc/extractor/generic.py
@@ -2022,22 +2022,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': [SpringboardPlatformIE.ie_key()],
},
- {
- 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
- 'info_dict': {
- 'id': 'uPDB5I9wfp8',
- 'ext': 'webm',
- 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
- 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
- 'upload_date': '20160219',
- 'uploader': 'Pocoyo - Português (BR)',
- 'uploader_id': 'PocoyoBrazil',
- },
- 'add_ie': [YoutubeIE.ie_key()],
- 'params': {
- 'skip_download': True,
- },
- },
{
'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
'info_dict': {
diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py
index c3eba0114..1eeddc3b6 100644
--- a/youtube_dlc/extractor/instagram.py
+++ b/youtube_dlc/extractor/instagram.py
@@ -22,7 +22,7 @@ from ..utils import (
class InstagramIE(InfoExtractor):
-    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv)/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
@@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor):
'timestamp': 1371748545,
'upload_date': '20130620',
'uploader_id': 'naomipq',
- 'uploader': 'Naomi Leonor Phan-Quang',
+ 'uploader': 'B E A U T Y F O R A S H E S',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor):
}, {
'url': 'https://www.instagram.com/tv/aye83DjauH/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
+ 'only_matching': True,
}]
@staticmethod
@@ -122,81 +125,92 @@ class InstagramIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- (video_url, description, thumbnail, timestamp, uploader,
+ (media, video_url, description, thumbnail, timestamp, uploader,
uploader_id, like_count, comment_count, comments, height,
- width) = [None] * 11
-
- shared_data = try_get(webpage,
- (lambda x: self._parse_json(
- self._search_regex(
- r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);',
- x, 'additional data', default='{}'),
- video_id, fatal=False),
- lambda x: self._parse_json(
- self._search_regex(
- r'window\._sharedData\s*=\s*({.+?});',
- x, 'shared data', default='{}'),
- video_id, fatal=False)['entry_data']['PostPage'][0]),
- None)
+ width) = [None] * 12
+
+ shared_data = self._parse_json(
+ self._search_regex(
+ r'window\._sharedData\s*=\s*({.+?});',
+ webpage, 'shared data', default='{}'),
+ video_id, fatal=False)
if shared_data:
media = try_get(
shared_data,
- (lambda x: x['graphql']['shortcode_media'],
- lambda x: x['media']),
+ (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+ lambda x: x['entry_data']['PostPage'][0]['media']),
dict)
- if media:
- video_url = media.get('video_url')
- height = int_or_none(media.get('dimensions', {}).get('height'))
- width = int_or_none(media.get('dimensions', {}).get('width'))
- description = try_get(
- media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
- compat_str) or media.get('caption')
- thumbnail = media.get('display_src') or media.get('thumbnail_src')
- timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
- uploader = media.get('owner', {}).get('full_name')
- uploader_id = media.get('owner', {}).get('username')
-
- def get_count(key, kind):
- return int_or_none(try_get(
+ # _sharedData.entry_data.PostPage is empty when authenticated (see
+ # https://github.com/ytdl-org/youtube-dl/pull/22880)
+ if not media:
+ additional_data = self._parse_json(
+ self._search_regex(
+ r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
+ webpage, 'additional data', default='{}'),
+ video_id, fatal=False)
+ if additional_data:
+ media = try_get(
+ additional_data, lambda x: x['graphql']['shortcode_media'],
+ dict)
+ if media:
+ video_url = media.get('video_url')
+ height = int_or_none(media.get('dimensions', {}).get('height'))
+ width = int_or_none(media.get('dimensions', {}).get('width'))
+ description = try_get(
+ media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+ compat_str) or media.get('caption')
+ thumbnail = media.get('display_src') or media.get('display_url')
+ timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
+ uploader = media.get('owner', {}).get('full_name')
+ uploader_id = media.get('owner', {}).get('username')
+
+ def get_count(keys, kind):
+ if not isinstance(keys, (list, tuple)):
+ keys = [keys]
+ for key in keys:
+ count = int_or_none(try_get(
media, (lambda x: x['edge_media_%s' % key]['count'],
lambda x: x['%ss' % kind]['count'])))
- like_count = get_count('preview_like', 'like')
- comment_count = get_count('to_comment', 'comment')
-
- comments = [{
- 'author': comment.get('user', {}).get('username'),
- 'author_id': comment.get('user', {}).get('id'),
- 'id': comment.get('id'),
- 'text': comment.get('text'),
- 'timestamp': int_or_none(comment.get('created_at')),
- } for comment in media.get(
- 'comments', {}).get('nodes', []) if comment.get('text')]
- if not video_url:
- edges = try_get(
- media, lambda x: x['edge_sidecar_to_children']['edges'],
- list) or []
- if edges:
- entries = []
- for edge_num, edge in enumerate(edges, start=1):
- node = try_get(edge, lambda x: x['node'], dict)
- if not node:
- continue
- node_video_url = url_or_none(node.get('video_url'))
- if not node_video_url:
- continue
- entries.append({
- 'id': node.get('shortcode') or node['id'],
- 'title': 'Video %d' % edge_num,
- 'url': node_video_url,
- 'thumbnail': node.get('display_url'),
- 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
- 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
- 'view_count': int_or_none(node.get('video_view_count')),
- })
- return self.playlist_result(
- entries, video_id,
- 'Post by %s' % uploader_id if uploader_id else None,
- description)
+ if count is not None:
+ return count
+ like_count = get_count('preview_like', 'like')
+ comment_count = get_count(
+ ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
+
+ comments = [{
+ 'author': comment.get('user', {}).get('username'),
+ 'author_id': comment.get('user', {}).get('id'),
+ 'id': comment.get('id'),
+ 'text': comment.get('text'),
+ 'timestamp': int_or_none(comment.get('created_at')),
+ } for comment in media.get(
+ 'comments', {}).get('nodes', []) if comment.get('text')]
+ if not video_url:
+ edges = try_get(
+ media, lambda x: x['edge_sidecar_to_children']['edges'],
+ list) or []
+ if edges:
+ entries = []
+ for edge_num, edge in enumerate(edges, start=1):
+ node = try_get(edge, lambda x: x['node'], dict)
+ if not node:
+ continue
+ node_video_url = url_or_none(node.get('video_url'))
+ if not node_video_url:
+ continue
+ entries.append({
+ 'id': node.get('shortcode') or node['id'],
+ 'title': 'Video %d' % edge_num,
+ 'url': node_video_url,
+ 'thumbnail': node.get('display_url'),
+ 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
+ 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
+ 'view_count': int_or_none(node.get('video_view_count')),
+ })
+ return self.playlist_result(
+ entries, video_id,
+ 'Post by %s' % uploader_id if uploader_id else None,
+ description)
if not video_url:
video_url = self._og_search_video_url(webpage, secure=False)
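The ordering flip in the Instagram hunk is the substance of the fix: `_sharedData.entry_data.PostPage` comes back empty for authenticated users (ytdl-org/youtube-dl#22880), so `window.__additionalDataLoaded(...)` is now the fallback rather than the first choice. A condensed, self-contained sketch of that two-stage lookup, using the same regexes as the patch with simplified error handling:

```python
import json
import re


def extract_media(webpage):
    """_sharedData first, then the __additionalDataLoaded payload that
    Instagram serves to authenticated users."""
    def search_json(pattern):
        m = re.search(pattern, webpage)
        return json.loads(m.group(1)) if m else {}

    shared_data = search_json(r'window\._sharedData\s*=\s*({.+?});')
    post_page = (shared_data.get('entry_data', {}).get('PostPage') or [{}])[0]
    media = ((post_page.get('graphql') or {}).get('shortcode_media')
             or post_page.get('media'))
    if not media:
        additional = search_json(
            r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;')
        media = (additional.get('graphql') or {}).get('shortcode_media')
    return media
```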
diff --git a/youtube_dlc/extractor/pornhub.py b/youtube_dlc/extractor/pornhub.py
index 9ad92a8ec..2fcbd186f 100644
--- a/youtube_dlc/extractor/pornhub.py
+++ b/youtube_dlc/extractor/pornhub.py
@@ -288,14 +288,24 @@ class PornHubIE(PornHubBaseIE):
video_urls.append((v_url, None))
video_urls_set.add(v_url)
+ def parse_quality_items(quality_items):
+ q_items = self._parse_json(quality_items, video_id, fatal=False)
+ if not isinstance(q_items, list):
+ return
+ for item in q_items:
+ if isinstance(item, dict):
+ add_video_url(item.get('url'))
+
if not video_urls:
- FORMAT_PREFIXES = ('media', 'quality')
+ FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
js_vars = extract_js_vars(
webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
default=None)
if js_vars:
for key, format_url in js_vars.items():
- if any(key.startswith(p) for p in FORMAT_PREFIXES):
+ if key.startswith(FORMAT_PREFIXES[-1]):
+ parse_quality_items(format_url)
+ elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
add_video_url(format_url)
if not video_urls and re.search(
r'<[^>]+\bid=["\']lockedPlayer', webpage):
@@ -351,12 +361,16 @@ class PornHubIE(PornHubBaseIE):
r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
webpage, 'uploader', default=None)
+ def extract_vote_count(kind, name):
+ return self._extract_count(
+            (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)' % kind,
+             r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind),
+ webpage, name)
+
view_count = self._extract_count(
r'([\d,\.]+) [Vv]iews', webpage, 'view')
- like_count = self._extract_count(
-            r'<span[^>]+class="votesUp"[^>]*>([\d,\.]+)', webpage, 'like')
- dislike_count = self._extract_count(
-            r'<span[^>]+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike')
+ like_count = extract_vote_count('Up', 'like')
+ dislike_count = extract_vote_count('Down', 'dislike')
comment_count = self._extract_count(
r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment')
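For the PornHub hunk: `qualityItems_*` variables differ from the `media_*`/`quality_*` ones in that their value is a JSON array rather than a bare URL, hence the dedicated parser. A standalone sketch of what `parse_quality_items` consumes; the item shape beyond `url` is an assumption for illustration:

```python
import json


def parse_quality_items(quality_items, add_video_url):
    """quality_items is the decoded JS string: a JSON array whose entries
    are dicts carrying a direct 'url' (other fields ignored here)."""
    try:
        q_items = json.loads(quality_items)
    except ValueError:
        return
    if not isinstance(q_items, list):
        return
    for item in q_items:
        if isinstance(item, dict) and item.get('url'):
            add_video_url(item['url'])


urls = []
parse_quality_items(
    '[{"quality": "720", "url": "https://cdn.example.com/v720.mp4"}]',
    urls.append)
assert urls == ['https://cdn.example.com/v720.mp4']
```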
diff --git a/youtube_dlc/extractor/spankbang.py b/youtube_dlc/extractor/spankbang.py
index 61ca902ce..37cb8c839 100644
--- a/youtube_dlc/extractor/spankbang.py
+++ b/youtube_dlc/extractor/spankbang.py
@@ -7,17 +7,24 @@ from ..utils import (
determine_ext,
ExtractorError,
merge_dicts,
- orderedSet,
parse_duration,
parse_resolution,
str_to_int,
url_or_none,
urlencode_postdata,
+ urljoin,
)
class SpankBangIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?spankbang\.com/
+                        (?:
+                            (?P<id>[\da-z]+)/(?:video|play|embed)\b|
+                            [\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+
+                        )
+                    '''
_TESTS = [{
'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
'md5': '1cc433e1d6aa14bc376535b8679302f7',
@@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor):
}, {
'url': 'https://spankbang.com/2y3td/embed/',
'only_matching': True,
+ }, {
+ 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('id_2')
webpage = self._download_webpage(
url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
video_id, headers={'Cookie': 'country=US'})
@@ -155,30 +166,33 @@ class SpankBangIE(InfoExtractor):
class SpankBangPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)'
_TEST = {
'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
'info_dict': {
'id': 'ug0k',
'title': 'Big Ass Titties',
},
- 'playlist_mincount': 50,
+ 'playlist_mincount': 40,
}
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ display_id = mobj.group('display_id')
webpage = self._download_webpage(
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
entries = [self.url_result(
- 'https://spankbang.com/%s/video' % video_id,
- ie=SpankBangIE.ie_key(), video_id=video_id)
- for video_id in orderedSet(re.findall(
-                r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))]
+ urljoin(url, mobj.group('path')),
+ ie=SpankBangIE.ie_key(), video_id=mobj.group('id'))
+ for mobj in re.finditer(
+                r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1'
+ % re.escape(display_id), webpage)]
title = self._html_search_regex(
-            r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
+            r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
fatal=False)
return self.playlist_result(entries, playlist_id, title)
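Context for the SpankBang playlist rework: playlist pages no longer link entries as `/<id>/play/`, but as `/<video>-<id>/playlist/<slug>` under the playlist itself, which is why the scan moved from `re.findall` over bare ids to `re.finditer` over whole hrefs. A standalone sketch with invented markup:

```python
import re


def playlist_entries(webpage, display_id):
    """Yield (video_id, path) pairs using the same href pattern as the patch."""
    pattern = (
        r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)'
        r'/playlist/%s(?:(?!\1).)*)\1' % re.escape(display_id))
    for mobj in re.finditer(pattern, webpage):
        yield mobj.group('id'), mobj.group('path')


page = '<a href="/2v7ik-7ecbgu/playlist/latina+booty">clip</a>'
assert list(playlist_entries(page, 'latina+booty')) == [
    ('7ecbgu', '/2v7ik-7ecbgu/playlist/latina+booty')]
```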
diff --git a/youtube_dlc/extractor/sprout.py b/youtube_dlc/extractor/sprout.py
index 8467bf49d..e243732f2 100644
--- a/youtube_dlc/extractor/sprout.py
+++ b/youtube_dlc/extractor/sprout.py
@@ -3,50 +3,62 @@ from __future__ import unicode_literals
from .adobepass import AdobePassIE
from ..utils import (
- extract_attributes,
- update_url_query,
+ int_or_none,
smuggle_url,
+ update_url_query,
)
class SproutIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
- 'md5': '74bf14128578d1e040c3ebc82088f45f',
+    _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race',
'info_dict': {
- 'id': '9dexnwtmh8_X',
+ 'id': 'bm0foJFaTKqb',
'ext': 'mp4',
- 'title': 'A Cowboy Adventure',
- 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.',
- 'timestamp': 1437758640,
- 'upload_date': '20150724',
- 'uploader': 'NBCU-SPROUT-NEW',
- }
- }
+ 'title': 'Robot Bike Race',
+ 'description': 'md5:436b1d97117cc437f54c383f4debc66d',
+ 'timestamp': 1606148940,
+ 'upload_date': '20201123',
+ 'uploader': 'NBCU-MPAT',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.universalkids.com/watch/robot-bike-race',
+ 'only_matching': True,
+ }]
+ _GEO_COUNTRIES = ['US']
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_component = self._search_regex(
-            r'(?s)(<div[^>]+data-component="video"[^>]*?>)',
- webpage, 'video component', default=None)
- if video_component:
- options = self._parse_json(extract_attributes(
- video_component)['data-options'], video_id)
- theplatform_url = options['video']
- query = {
- 'mbr': 'true',
- 'manifest': 'm3u',
- }
- if options.get('protected'):
- query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout')
- theplatform_url = smuggle_url(update_url_query(
- theplatform_url, query), {'force_smil_url': True})
- else:
- iframe = self._search_regex(
-                r'(<iframe[^>]+id="sproutVideoIframe"[^>]*>)',