|
|
|
@ -398,6 +398,10 @@ class FileDownloader(object):
|
|
|
|
|
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
|
|
|
|
|
if self.params.get('forceurl', False):
|
|
|
|
|
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
|
|
|
|
|
if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
|
|
|
|
|
print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
|
|
|
|
|
if self.params.get('forcedescription', False) and 'description' in info_dict:
|
|
|
|
|
print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
|
|
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
@ -599,6 +603,14 @@ class InfoExtractor(object):
|
|
|
|
|
ext: Video filename extension.
|
|
|
|
|
format: Video format.
|
|
|
|
|
|
|
|
|
|
The following fields are optional. Their primary purpose is to allow
|
|
|
|
|
youtube-dl to serve as the backend for a video search function, such
|
|
|
|
|
as the one in youtube2mp3. They are only used when their respective
|
|
|
|
|
forced printing functions are called:
|
|
|
|
|
|
|
|
|
|
thumbnail: Full URL to a video thumbnail image.
|
|
|
|
|
description: One-line video description.
|
|
|
|
|
|
|
|
|
|
Subclasses of this one should re-define the _real_initialize() and
|
|
|
|
|
_real_extract() methods, as well as the suitable() static method.
|
|
|
|
|
Probably, they should also be instantiated and added to the main
|
|
|
|
@ -842,6 +854,28 @@ class YoutubeIE(InfoExtractor):
|
|
|
|
|
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
|
|
|
|
|
simple_title = simple_title.strip(ur'_')
|
|
|
|
|
|
|
|
|
|
# thumbnail image
|
|
|
|
|
if 'thumbnail_url' not in video_info:
|
|
|
|
|
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
|
|
|
|
|
video_thumbnail = ''
|
|
|
|
|
else: # don't panic if we can't find it
|
|
|
|
|
video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
|
|
|
|
|
|
|
|
|
|
# get video description
|
|
|
|
|
video_description = 'No description available.' # we need something to pass to self._downloader
|
|
|
|
|
# this requires an additional HTTP request and a little
|
|
|
|
|
# more time, so don't do it unless absolutely necessary
|
|
|
|
|
if self._downloader.params.get('forcedescription', False):
|
|
|
|
|
video_page_url = 'http://www.youtube.com/watch?v=' + video_id
|
|
|
|
|
request = urllib2.Request(video_page_url, None, std_headers)
|
|
|
|
|
try:
|
|
|
|
|
video_page_webpage = urllib2.urlopen(request).read()
|
|
|
|
|
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
|
|
|
|
|
if mobj is not None:
|
|
|
|
|
video_description = mobj.group(1)
|
|
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
|
|
|
|
|
pass # don't panic if we can't find it
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Process video information
|
|
|
|
|
self._downloader.process_info({
|
|
|
|
@ -852,6 +886,8 @@ class YoutubeIE(InfoExtractor):
|
|
|
|
|
'stitle': simple_title,
|
|
|
|
|
'ext': video_extension.decode('utf-8'),
|
|
|
|
|
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
|
|
|
|
|
'thumbnail': video_thumbnail.decode('utf-8'),
|
|
|
|
|
'description': video_description.decode('utf-8'),
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if all_formats:
|
|
|
|
@ -1080,6 +1116,32 @@ class GoogleIE(InfoExtractor):
|
|
|
|
|
video_title = sanitize_title(video_title)
|
|
|
|
|
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
|
|
|
|
|
|
|
|
|
|
# Extract video description
|
|
|
|
|
mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to extract video description')
|
|
|
|
|
return
|
|
|
|
|
video_description = mobj.group(1).decode('utf-8')
|
|
|
|
|
if not video_description:
|
|
|
|
|
video_description = 'No description available.'
|
|
|
|
|
|
|
|
|
|
# Extract video thumbnail
|
|
|
|
|
if self._downloader.params.get('forcethumbnail', False):
|
|
|
|
|
request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
|
|
|
|
|
try:
|
|
|
|
|
webpage = urllib2.urlopen(request).read()
|
|
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
|
|
|
|
|
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
|
|
|
|
|
return
|
|
|
|
|
mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
|
|
|
|
|
return
|
|
|
|
|
video_thumbnail = mobj.group(1)
|
|
|
|
|
else: # we need something to pass to process_info
|
|
|
|
|
video_thumbnail = ''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Process video information
|
|
|
|
|
self._downloader.process_info({
|
|
|
|
@ -1258,6 +1320,21 @@ class YahooIE(InfoExtractor):
|
|
|
|
|
return
|
|
|
|
|
video_uploader = mobj.group(1).decode('utf-8')
|
|
|
|
|
|
|
|
|
|
# Extract video thumbnail
|
|
|
|
|
mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
|
|
|
|
|
return
|
|
|
|
|
video_thumbnail = mobj.group(1).decode('utf-8')
|
|
|
|
|
|
|
|
|
|
# Extract video description
|
|
|
|
|
mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to extract video description')
|
|
|
|
|
return
|
|
|
|
|
video_description = mobj.group(1).decode('utf-8')
|
|
|
|
|
if not video_description: video_description = 'No description available.'
|
|
|
|
|
|
|
|
|
|
# Extract video height and width
|
|
|
|
|
mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
|
|
|
|
|
if mobj is None:
|
|
|
|
@ -1303,6 +1380,10 @@ class YahooIE(InfoExtractor):
|
|
|
|
|
'title': video_title,
|
|
|
|
|
'stitle': simple_title,
|
|
|
|
|
'ext': video_extension.decode('utf-8'),
|
|
|
|
|
'thumbnail': video_thumbnail.decode('utf-8'),
|
|
|
|
|
'description': video_description,
|
|
|
|
|
'thumbnail': video_thumbnail,
|
|
|
|
|
'description': video_description,
|
|
|
|
|
})
|
|
|
|
|
except UnavailableFormatError:
|
|
|
|
|
self._downloader.trouble(u'ERROR: format not available for video')
|
|
|
|
@ -1494,6 +1575,188 @@ class YoutubeSearchIE(InfoExtractor):
|
|
|
|
|
|
|
|
|
|
pagenum = pagenum + 1
|
|
|
|
|
|
|
|
|
|
class GoogleSearchIE(InfoExtractor):
|
|
|
|
|
"""Information Extractor for Google Video search queries."""
|
|
|
|
|
_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
|
|
|
|
|
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
|
|
|
|
|
_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
|
|
|
|
|
_MORE_PAGES_INDICATOR = r'<span>Next</span>'
|
|
|
|
|
_google_ie = None
|
|
|
|
|
_max_google_results = 1000
|
|
|
|
|
|
|
|
|
|
def __init__(self, google_ie, downloader=None):
|
|
|
|
|
InfoExtractor.__init__(self, downloader)
|
|
|
|
|
self._google_ie = google_ie
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def suitable(url):
|
|
|
|
|
return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
|
|
|
|
|
|
|
|
|
|
def report_download_page(self, query, pagenum):
|
|
|
|
|
"""Report attempt to download playlist page with given number."""
|
|
|
|
|
query = query.decode(preferredencoding())
|
|
|
|
|
self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
|
|
|
|
|
|
|
|
|
|
def _real_initialize(self):
|
|
|
|
|
self._google_ie.initialize()
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, query):
|
|
|
|
|
mobj = re.match(self._VALID_QUERY, query)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
prefix, query = query.split(':')
|
|
|
|
|
prefix = prefix[8:]
|
|
|
|
|
query = query.encode('utf-8')
|
|
|
|
|
if prefix == '':
|
|
|
|
|
self._download_n_results(query, 1)
|
|
|
|
|
return
|
|
|
|
|
elif prefix == 'all':
|
|
|
|
|
self._download_n_results(query, self._max_google_results)
|
|
|
|
|
return
|
|
|
|
|
else:
|
|
|
|
|
try:
|
|
|
|
|
n = long(prefix)
|
|
|
|
|
if n <= 0:
|
|
|
|
|
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
|
|
|
|
|
return
|
|
|
|
|
elif n > self._max_google_results:
|
|
|
|
|
self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
|
|
|
|
|
n = self._max_google_results
|
|
|
|
|
self._download_n_results(query, n)
|
|
|
|
|
return
|
|
|
|
|
except ValueError: # parsing prefix as integer fails
|
|
|
|
|
self._download_n_results(query, 1)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
def _download_n_results(self, query, n):
|
|
|
|
|
"""Downloads a specified number of results for a query"""
|
|
|
|
|
|
|
|
|
|
video_ids = []
|
|
|
|
|
already_seen = set()
|
|
|
|
|
pagenum = 1
|
|
|
|
|
|
|
|
|
|
while True:
|
|
|
|
|
self.report_download_page(query, pagenum)
|
|
|
|
|
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
|
|
|
|
|
request = urllib2.Request(result_url, None, std_headers)
|
|
|
|
|
try:
|
|
|
|
|
page = urllib2.urlopen(request).read()
|
|
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
# Extract video identifiers
|
|
|
|
|
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
|
|
|
|
|
video_id = mobj.group(1)
|
|
|
|
|
if video_id not in already_seen:
|
|
|
|
|
video_ids.append(video_id)
|
|
|
|
|
already_seen.add(video_id)
|
|
|
|
|
if len(video_ids) == n:
|
|
|
|
|
# Specified n videos reached
|
|
|
|
|
for id in video_ids:
|
|
|
|
|
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
|
|
|
|
|
for id in video_ids:
|
|
|
|
|
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
pagenum = pagenum + 1
|
|
|
|
|
|
|
|
|
|
class YahooSearchIE(InfoExtractor):
|
|
|
|
|
"""Information Extractor for Yahoo! Video search queries."""
|
|
|
|
|
_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
|
|
|
|
|
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
|
|
|
|
|
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
|
|
|
|
|
_MORE_PAGES_INDICATOR = r'\s*Next'
|
|
|
|
|
_yahoo_ie = None
|
|
|
|
|
_max_yahoo_results = 1000
|
|
|
|
|
|
|
|
|
|
def __init__(self, yahoo_ie, downloader=None):
|
|
|
|
|
InfoExtractor.__init__(self, downloader)
|
|
|
|
|
self._yahoo_ie = yahoo_ie
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def suitable(url):
|
|
|
|
|
return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
|
|
|
|
|
|
|
|
|
|
def report_download_page(self, query, pagenum):
|
|
|
|
|
"""Report attempt to download playlist page with given number."""
|
|
|
|
|
query = query.decode(preferredencoding())
|
|
|
|
|
self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
|
|
|
|
|
|
|
|
|
|
def _real_initialize(self):
|
|
|
|
|
self._yahoo_ie.initialize()
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, query):
|
|
|
|
|
mobj = re.match(self._VALID_QUERY, query)
|
|
|
|
|
if mobj is None:
|
|
|
|
|
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
prefix, query = query.split(':')
|
|
|
|
|
prefix = prefix[8:]
|
|
|
|
|
query = query.encode('utf-8')
|
|
|
|
|
if prefix == '':
|
|
|
|
|
self._download_n_results(query, 1)
|
|
|
|
|
return
|
|
|
|
|
elif prefix == 'all':
|
|
|
|
|
self._download_n_results(query, self._max_yahoo_results)
|
|
|
|
|
return
|
|
|
|
|
else:
|
|
|
|
|
try:
|
|
|
|
|
n = long(prefix)
|
|
|
|
|
if n <= 0:
|
|
|
|
|
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
|
|
|
|
|
return
|
|
|
|
|
elif n > self._max_yahoo_results:
|
|
|
|
|
self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
|
|
|
|
|
n = self._max_yahoo_results
|
|
|
|
|
self._download_n_results(query, n)
|
|
|
|
|
return
|
|
|
|
|
except ValueError: # parsing prefix as integer fails
|
|
|
|
|
self._download_n_results(query, 1)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
def _download_n_results(self, query, n):
|
|
|
|
|
"""Downloads a specified number of results for a query"""
|
|
|
|
|
|
|
|
|
|
video_ids = []
|
|
|
|
|
already_seen = set()
|
|
|
|
|
pagenum = 1
|
|
|
|
|
|
|
|
|
|
while True:
|
|
|
|
|
self.report_download_page(query, pagenum)
|
|
|
|
|
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
|
|
|
|
|
request = urllib2.Request(result_url, None, std_headers)
|
|
|
|
|
try:
|
|
|
|
|
page = urllib2.urlopen(request).read()
|
|
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
|
|
|
|
|
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
# Extract video identifiers
|
|
|
|
|
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
|
|
|
|
|
video_id = mobj.group(1)
|
|
|
|
|
if video_id not in already_seen:
|
|
|
|
|
video_ids.append(video_id)
|
|
|
|
|
already_seen.add(video_id)
|
|
|
|
|
if len(video_ids) == n:
|
|
|
|
|
# Specified n videos reached
|
|
|
|
|
for id in video_ids:
|
|
|
|
|
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
|
|
|
|
|
for id in video_ids:
|
|
|
|
|
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
pagenum = pagenum + 1
|
|
|
|
|
|
|
|
|
|
class YoutubePlaylistIE(InfoExtractor):
|
|
|
|
|
"""Information Extractor for YouTube playlists."""
|
|
|
|
|
|
|
|
|
@ -1732,6 +1995,10 @@ if __name__ == '__main__':
|
|
|
|
|
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
|
|
|
|
|
verbosity.add_option('-e', '--get-title',
|
|
|
|
|
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
|
|
|
|
|
verbosity.add_option('--get-thumbnail',
|
|
|
|
|
action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
|
|
|
|
|
verbosity.add_option('--get-description',
|
|
|
|
|
action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
|
|
|
|
|
verbosity.add_option('--no-progress',
|
|
|
|
|
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
|
|
|
|
|
parser.add_option_group(verbosity)
|
|
|
|
@ -1788,8 +2055,10 @@ if __name__ == '__main__':
|
|
|
|
|
youtube_user_ie = YoutubeUserIE(youtube_ie)
|
|
|
|
|
youtube_search_ie = YoutubeSearchIE(youtube_ie)
|
|
|
|
|
google_ie = GoogleIE()
|
|
|
|
|
google_search_ie = GoogleSearchIE(google_ie)
|
|
|
|
|
photobucket_ie = PhotobucketIE()
|
|
|
|
|
yahoo_ie = YahooIE()
|
|
|
|
|
yahoo_search_ie = YahooSearchIE(yahoo_ie)
|
|
|
|
|
generic_ie = GenericIE()
|
|
|
|
|
|
|
|
|
|
# File downloader
|
|
|
|
@ -1797,10 +2066,12 @@ if __name__ == '__main__':
|
|
|
|
|
'usenetrc': opts.usenetrc,
|
|
|
|
|
'username': opts.username,
|
|
|
|
|
'password': opts.password,
|
|
|
|
|
'quiet': (opts.quiet or opts.geturl or opts.gettitle),
|
|
|
|
|
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
|
|
|
|
|
'forceurl': opts.geturl,
|
|
|
|
|
'forcetitle': opts.gettitle,
|
|
|
|
|
'simulate': (opts.simulate or opts.geturl or opts.gettitle),
|
|
|
|
|
'forcethumbnail': opts.getthumbnail,
|
|
|
|
|
'forcedescription': opts.getdescription,
|
|
|
|
|
'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
|
|
|
|
|
'format': opts.format,
|
|
|
|
|
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
|
|
|
|
|
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
|
|
|
|
@ -1821,8 +2092,10 @@ if __name__ == '__main__':
|
|
|
|
|
fd.add_info_extractor(metacafe_ie)
|
|
|
|
|
fd.add_info_extractor(youtube_ie)
|
|
|
|
|
fd.add_info_extractor(google_ie)
|
|
|
|
|
fd.add_info_extractor(google_search_ie)
|
|
|
|
|
fd.add_info_extractor(photobucket_ie)
|
|
|
|
|
fd.add_info_extractor(yahoo_ie)
|
|
|
|
|
fd.add_info_extractor(yahoo_search_ie)
|
|
|
|
|
|
|
|
|
|
# This must come last since it's the
|
|
|
|
|
# fallback if none of the others work
|
|
|
|
|