|
|
@ -22,11 +22,15 @@ from ..utils import (
|
|
|
|
orderedSet,
|
|
|
|
orderedSet,
|
|
|
|
remove_quotes,
|
|
|
|
remove_quotes,
|
|
|
|
str_to_int,
|
|
|
|
str_to_int,
|
|
|
|
|
|
|
|
update_url_query,
|
|
|
|
|
|
|
|
urlencode_postdata,
|
|
|
|
url_or_none,
|
|
|
|
url_or_none,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubBaseIE(InfoExtractor):
|
|
|
|
class PornHubBaseIE(InfoExtractor):
|
|
|
|
|
|
|
|
_NETRC_MACHINE = 'pornhub'
|
|
|
|
|
|
|
|
|
|
|
|
def _download_webpage_handle(self, *args, **kwargs):
|
|
|
|
def _download_webpage_handle(self, *args, **kwargs):
|
|
|
|
def dl(*args, **kwargs):
|
|
|
|
def dl(*args, **kwargs):
|
|
|
|
return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
|
|
|
|
return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
|
|
|
@ -52,6 +56,66 @@ class PornHubBaseIE(InfoExtractor):
|
|
|
|
|
|
|
|
|
|
|
|
return webpage, urlh
|
|
|
|
return webpage, urlh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _real_initialize(self):
|
|
|
|
|
|
|
|
self._logged_in = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _login(self, host):
|
|
|
|
|
|
|
|
if self._logged_in:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
site = host.split('.')[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Both sites pornhub and pornhubpremium have separate accounts
|
|
|
|
|
|
|
|
# so there should be an option to provide credentials for both.
|
|
|
|
|
|
|
|
# At the same time some videos are available under the same video id
|
|
|
|
|
|
|
|
# on both sites so that we have to identify them as the same video.
|
|
|
|
|
|
|
|
# For that purpose we have to keep both in the same extractor
|
|
|
|
|
|
|
|
# but under different netrc machines.
|
|
|
|
|
|
|
|
username, password = self._get_login_info(netrc_machine=site)
|
|
|
|
|
|
|
|
if username is None:
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
|
|
|
|
|
|
|
|
login_page = self._download_webpage(
|
|
|
|
|
|
|
|
login_url, None, 'Downloading %s login page' % site)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_logged(webpage):
|
|
|
|
|
|
|
|
return any(re.search(p, webpage) for p in (
|
|
|
|
|
|
|
|
r'class=["\']signOut',
|
|
|
|
|
|
|
|
r'>Sign\s+[Oo]ut\s*<'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if is_logged(login_page):
|
|
|
|
|
|
|
|
self._logged_in = True
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
login_form = self._hidden_inputs(login_page)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
login_form.update({
|
|
|
|
|
|
|
|
'username': username,
|
|
|
|
|
|
|
|
'password': password,
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
response = self._download_json(
|
|
|
|
|
|
|
|
'https://www.%s/front/authenticate' % host, None,
|
|
|
|
|
|
|
|
'Logging in to %s' % site,
|
|
|
|
|
|
|
|
data=urlencode_postdata(login_form),
|
|
|
|
|
|
|
|
headers={
|
|
|
|
|
|
|
|
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
|
|
|
|
|
|
|
'Referer': login_url,
|
|
|
|
|
|
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if response.get('success') == '1':
|
|
|
|
|
|
|
|
self._logged_in = True
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
message = response.get('message')
|
|
|
|
|
|
|
|
if message is not None:
|
|
|
|
|
|
|
|
raise ExtractorError(
|
|
|
|
|
|
|
|
'Unable to login: %s' % message, expected=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
raise ExtractorError('Unable to log in')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubIE(PornHubBaseIE):
|
|
|
|
class PornHubIE(PornHubBaseIE):
|
|
|
|
IE_DESC = 'PornHub and Thumbzilla'
|
|
|
|
IE_DESC = 'PornHub and Thumbzilla'
|
|
|
@ -163,12 +227,20 @@ class PornHubIE(PornHubBaseIE):
|
|
|
|
}, {
|
|
|
|
}, {
|
|
|
|
'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
|
|
|
|
'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
|
|
|
|
'only_matching': True,
|
|
|
|
'only_matching': True,
|
|
|
|
|
|
|
|
}, {
|
|
|
|
|
|
|
|
# Some videos are available with the same id on both premium
|
|
|
|
|
|
|
|
# and non-premium sites (e.g. this and the following test)
|
|
|
|
|
|
|
|
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
|
|
|
|
|
|
|
|
'only_matching': True,
|
|
|
|
|
|
|
|
}, {
|
|
|
|
|
|
|
|
'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
|
|
|
|
|
|
|
|
'only_matching': True,
|
|
|
|
}]
|
|
|
|
}]
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|
def _extract_urls(webpage):
|
|
|
|
def _extract_urls(webpage):
|
|
|
|
return re.findall(
|
|
|
|
return re.findall(
|
|
|
|
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)',
|
|
|
|
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)',
|
|
|
|
webpage)
|
|
|
|
webpage)
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_count(self, pattern, webpage, name):
|
|
|
|
def _extract_count(self, pattern, webpage, name):
|
|
|
@ -180,12 +252,7 @@ class PornHubIE(PornHubBaseIE):
|
|
|
|
host = mobj.group('host') or 'pornhub.com'
|
|
|
|
host = mobj.group('host') or 'pornhub.com'
|
|
|
|
video_id = mobj.group('id')
|
|
|
|
video_id = mobj.group('id')
|
|
|
|
|
|
|
|
|
|
|
|
if 'premium' in host:
|
|
|
|
self._login(host)
|
|
|
|
if not self._downloader.params.get('cookiefile'):
|
|
|
|
|
|
|
|
raise ExtractorError(
|
|
|
|
|
|
|
|
'PornHub Premium requires authentication.'
|
|
|
|
|
|
|
|
' You may want to use --cookies.',
|
|
|
|
|
|
|
|
expected=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self._set_cookie(host, 'age_verified', '1')
|
|
|
|
self._set_cookie(host, 'age_verified', '1')
|
|
|
|
|
|
|
|
|
|
|
@ -405,6 +472,10 @@ class PornHubIE(PornHubBaseIE):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubPlaylistBaseIE(PornHubBaseIE):
|
|
|
|
class PornHubPlaylistBaseIE(PornHubBaseIE):
|
|
|
|
|
|
|
|
def _extract_page(self, url):
    """Extract the number given by a 'page=<digits>' component of *url*.

    The digits are looked up with a default of None and the outcome is
    passed through int_or_none before being returned.
    """
    page_str = self._search_regex(
        r'\bpage=(\d+)', url, 'page', default=None)
    return int_or_none(page_str)
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_entries(self, webpage, host):
|
|
|
|
def _extract_entries(self, webpage, host):
|
|
|
|
# Only process container div with main playlist content skipping
|
|
|
|
# Only process container div with main playlist content skipping
|
|
|
|
# drop-down menu that uses similar pattern for videos (see
|
|
|
|
# drop-down menu that uses similar pattern for videos (see
|
|
|
@ -422,26 +493,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):
|
|
|
|
container))
|
|
|
|
container))
|
|
|
|
]
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url):
|
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url)
|
|
|
|
|
|
|
|
host = mobj.group('host')
|
|
|
|
|
|
|
|
playlist_id = mobj.group('id')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
webpage = self._download_webpage(url, playlist_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
entries = self._extract_entries(webpage, host)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
playlist = self._parse_json(
|
|
|
|
|
|
|
|
self._search_regex(
|
|
|
|
|
|
|
|
r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
|
|
|
|
|
|
|
|
'playlist', default='{}'),
|
|
|
|
|
|
|
|
playlist_id, fatal=False)
|
|
|
|
|
|
|
|
title = playlist.get('title') or self._search_regex(
|
|
|
|
|
|
|
|
r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return self.playlist_result(
|
|
|
|
|
|
|
|
entries, playlist_id, title, playlist.get('description'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubUserIE(PornHubPlaylistBaseIE):
|
|
|
|
class PornHubUserIE(PornHubPlaylistBaseIE):
|
|
|
|
_VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
|
|
|
|
_VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
|
|
|
@ -463,14 +514,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
|
|
|
|
}, {
|
|
|
|
}, {
|
|
|
|
'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
|
|
|
|
'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
|
|
|
|
'only_matching': True,
|
|
|
|
'only_matching': True,
|
|
|
|
|
|
|
|
}, {
|
|
|
|
|
|
|
|
# Unavailable via /videos page, but available with direct pagination
|
|
|
|
|
|
|
|
# on pornstar page (see [1]), requires premium
|
|
|
|
|
|
|
|
# 1. https://github.com/ytdl-org/youtube-dl/issues/27853
|
|
|
|
|
|
|
|
'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
|
|
|
|
|
|
|
|
'only_matching': True,
|
|
|
|
|
|
|
|
}, {
|
|
|
|
|
|
|
|
# Same as before, multi page
|
|
|
|
|
|
|
|
'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
|
|
|
|
|
|
|
|
'only_matching': True,
|
|
|
|
}]
|
|
|
|
}]
|
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url):
    """Delegate a user/channel/model/pornstar URL to its video listing.

    Builds '<matched url>/videos', carries over a page number from the
    original URL when _extract_page() yields a truthy one, and returns a
    url_result() that points at PornHubPagedVideoListIE.
    """
    match = re.match(self._VALID_URL, url)
    listing_url = '%s/videos' % match.group('url')

    requested_page = self._extract_page(url)
    if requested_page:
        listing_url = update_url_query(listing_url, {'page': requested_page})

    return self.url_result(
        listing_url,
        ie=PornHubPagedVideoListIE.ie_key(),
        video_id=match.group('id'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
|
|
|
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
|
|
@ -483,32 +547,55 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
|
|
|
<button[^>]+\bid=["\']moreDataBtn
|
|
|
|
<button[^>]+\bid=["\']moreDataBtn
|
|
|
|
''', webpage) is not None
|
|
|
|
''', webpage) is not None
|
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url):
|
|
|
|
def _entries(self, url, host, item_id):
|
|
|
|
mobj = re.match(self._VALID_URL, url)
|
|
|
|
page = self._extract_page(url)
|
|
|
|
host = mobj.group('host')
|
|
|
|
|
|
|
|
item_id = mobj.group('id')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
page = int_or_none(self._search_regex(
|
|
|
|
VIDEOS = '/videos'
|
|
|
|
r'\bpage=(\d+)', url, 'page', default=None))
|
|
|
|
|
|
|
|
|
|
|
|
def download_page(base_url, num, fallback=False):
|
|
|
|
|
|
|
|
note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
|
|
|
|
|
|
|
|
return self._download_webpage(
|
|
|
|
|
|
|
|
base_url, item_id, note, query={'page': num})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_404(e):
|
|
|
|
|
|
|
|
return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
|
|
|
|
|
|
|
|
|
|
|
|
entries = []
|
|
|
|
base_url = url
|
|
|
|
for page_num in (page, ) if page is not None else itertools.count(1):
|
|
|
|
has_page = page is not None
|
|
|
|
|
|
|
|
first_page = page if has_page else 1
|
|
|
|
|
|
|
|
for page_num in (first_page, ) if has_page else itertools.count(first_page):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
webpage = self._download_webpage(
|
|
|
|
try:
|
|
|
|
url, item_id, 'Downloading page %d' % page_num,
|
|
|
|
webpage = download_page(base_url, page_num)
|
|
|
|
query={'page': page_num})
|
|
|
|
except ExtractorError as e:
|
|
|
|
|
|
|
|
# Some sources may not be available via /videos page,
|
|
|
|
|
|
|
|
# trying to fallback to main page pagination (see [1])
|
|
|
|
|
|
|
|
# 1. https://github.com/ytdl-org/youtube-dl/issues/27853
|
|
|
|
|
|
|
|
if is_404(e) and page_num == first_page and VIDEOS in base_url:
|
|
|
|
|
|
|
|
base_url = base_url.replace(VIDEOS, '')
|
|
|
|
|
|
|
|
webpage = download_page(base_url, page_num, fallback=True)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
raise
|
|
|
|
except ExtractorError as e:
|
|
|
|
except ExtractorError as e:
|
|
|
|
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
|
|
|
|
if is_404(e) and page_num != first_page:
|
|
|
|
break
|
|
|
|
break
|
|
|
|
raise
|
|
|
|
raise
|
|
|
|
page_entries = self._extract_entries(webpage, host)
|
|
|
|
page_entries = self._extract_entries(webpage, host)
|
|
|
|
if not page_entries:
|
|
|
|
if not page_entries:
|
|
|
|
break
|
|
|
|
break
|
|
|
|
entries.extend(page_entries)
|
|
|
|
for e in page_entries:
|
|
|
|
|
|
|
|
yield e
|
|
|
|
if not self._has_more(webpage):
|
|
|
|
if not self._has_more(webpage):
|
|
|
|
break
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
return self.playlist_result(orderedSet(entries), item_id)
|
|
|
|
def _real_extract(self, url):
|
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url)
|
|
|
|
|
|
|
|
host = mobj.group('host')
|
|
|
|
|
|
|
|
item_id = mobj.group('id')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self._login(host)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return self.playlist_result(self._entries(url, host, item_id), item_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
|
|
|
|
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
|
|
|
|