From 42393ce234c651aaae244e1546e1803101765acc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 17 Dec 2013 12:33:55 +0100 Subject: [PATCH] Add support for direct links to a video (#1973) --- youtube_dl/extractor/generic.py | 52 ++++++++++++++++++++++++++------- youtube_dl/utils.py | 5 ++++ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index da933067a3..209f68204e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -13,6 +13,8 @@ from ..utils import ( ExtractorError, smuggle_url, unescapeHTML, + unified_strdate, + url_basename, ) from .brightcove import BrightcoveIE @@ -71,6 +73,17 @@ class GenericIE(InfoExtractor): u'skip_download': True, }, }, + # Direct link to a video + { + u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4', + u'file': u'trailer.mp4', + u'md5': u'67d406c2bcb6af27fa886f31aa934bbe', + u'info_dict': { + u'id': u'trailer', + u'title': u'trailer', + u'upload_date': u'20100513', + } + } ] def report_download_webpage(self, video_id): @@ -83,7 +96,7 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - def _test_redirect(self, url): + def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" class HeadRequest(compat_urllib_request.Request): def get_method(self): @@ -131,29 +144,46 @@ class GenericIE(InfoExtractor): response = opener.open(HeadRequest(url)) if response is None: raise ExtractorError(u'Invalid URL protocol') - new_url = response.geturl() - - if url == new_url: - return False - - self.report_following_redirect(new_url) - return new_url + return response def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return 
self.url_result('http://' + url) + video_id = os.path.splitext(url.split('/')[-1])[0] try: - new_url = self._test_redirect(url) - if new_url: + response = self._send_head(url) + + # Check for redirect + new_url = response.geturl() + if url != new_url: + self.report_following_redirect(new_url) return self.url_result(new_url) + + # Check for direct link to a video + content_type = response.headers.get('Content-Type', '') + m = re.match(r'^(?:audio|video)/(?P<format_id>.+)$', content_type) + if m: + upload_date = response.headers.get('Last-Modified') + if upload_date: + upload_date = unified_strdate(upload_date) + assert (url_basename(url) == 'trailer.mp4') + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + }], + 'upload_date': upload_date, + } + except compat_urllib_error.HTTPError: # This may be a stupid server that doesn't like HEAD, our UA, or so pass - video_id = url.split('/')[-1] try: webpage = self._download_webpage(url, video_id) except ValueError: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2d12e2df93..d5069dcca9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -761,12 +761,17 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except: pass + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') return upload_date def determine_ext(url, default_ext=u'unknown_video'):