From 968b5e0112a83f2a4637226d4d743b394ebed038 Mon Sep 17 00:00:00 2001 From: Dan Church Date: Sun, 4 Aug 2013 12:45:24 -0500 Subject: [PATCH 01/16] Add some verbosity when reporting finished downloads For example: [download] Resuming download at byte 1868140 [download] Destination: Entry #1-Bn59FJ4HrmU.flv [download] 100% of 3.27MiB in 4s This format is meant to somewhat mirror the behavior of wget(1) when reporting finished downloads: 100%[==================>] 54,836,682 788KB/s in 74s 2013-08-04 12:32:05 (728 KB/s) - 'google-chrome-stable_current_x86_64.rpm' saved [54836682/54836682] --- youtube_dl/FileDownloader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index ea6b9d626..ab06533c0 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -230,12 +230,14 @@ class FileDownloader(object): """Report it was impossible to resume download.""" self.to_screen(u'[download] Unable to resume') - def report_finish(self): + def report_finish(self, data_len_str, tot_time): """Report download finished.""" if self.params.get('noprogress', False): self.to_screen(u'[download] Download completed') else: - self.to_screen(u'') + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + self.to_screen(u'\r%s[download] 100%% of %s in %ss' % + (clear_line, data_len_str, int(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) @@ -538,7 +540,7 @@ class FileDownloader(object): self.report_error(u'Did not get any data blocks') return False stream.close() - self.report_finish() + self.report_finish(data_len_str, (time.time() - start)) if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, int(data_len)) self.try_rename(tmpfilename, filename) From 461cead4f788f6a69902f350b9143a5e1588b57d Mon Sep 17 00:00:00 2001 From: tsantala Date: Tue, 6 Aug 2013 04:34:24 +0300 Subject: [PATCH 02/16] changes --- youtube_dl/extractor/AddAnime.py | 54 ++++++++++++++++++++++++++++++++ youtube_dl/extractor/__init__.py | 2 ++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/AddAnime.py diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py new file mode 100644 index 000000000..43b0b24fe --- /dev/null +++ b/youtube_dl/extractor/AddAnime.py @@ -0,0 +1,54 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) +from bs4 import BeautifulSoup + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^(?:http?://)?(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'137499050692ced.flv', + u'md5': u'0813c2430bea7a46bf13acf3406992f4', + u'info_dict': { + u"description": u"One Piece 606", + u"uploader": u"mugiwaraQ8", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r'var normal_video_file = "(.*?)",', + webpage, u'video URL') + + video_title = self._og_search_title(webpage) + + video_description = self._og_search_description(webpage) + + soup = BeautifulSoup(webpage) + + video_uploader= soup.find("meta", {"author":""})['content'] + + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader + } + + return [info] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..28dcb2cc4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ + +from .AddAnime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE From 5a27ecdd2ec83ba6e1069428c4c0fb3bd61f638c Mon Sep 17 00:00:00 2001 From: kkalpakloglou Date: Fri, 16 Aug 2013 23:54:09 +0300 Subject: [PATCH 03/16] Update AddAnime.py --- youtube_dl/extractor/AddAnime.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py index 43b0b24fe..a312fa97e 100644 --- a/youtube_dl/extractor/AddAnime.py +++ b/youtube_dl/extractor/AddAnime.py @@ -1,11 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) -from bs4 import BeautifulSoup - class AddAnimeIE(InfoExtractor): @@ -17,7 +12,6 @@ class AddAnimeIE(InfoExtractor): u'md5': u'0813c2430bea7a46bf13acf3406992f4', u'info_dict': { u"description": u"One Piece 606", - u"uploader": u"mugiwaraQ8", u"title": u"One Piece 606" } } @@ -31,24 +25,27 @@ class AddAnimeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'var normal_video_file = "(.*?)",', - webpage, u'video URL') + + def find_between( webpage, first, last ): + try: + start = webpage.index( first ) + len( first ) + end = webpage.index( last, start ) + return webpage[start:end] + except ValueError: + return "" + + video_url = find_between( webpage, "var normal_video_file = '", "';" ) video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) - - soup = BeautifulSoup(webpage) - - video_uploader= soup.find("meta", {"author":""})['content'] info = { 'id': video_id, 'url': video_url, 'ext': 'flv', 'title': video_title, - 'description': video_description, - 'uploader': video_uploader + 'description': video_description } return [info] From 1a582dd49d628914fa6a056b490914738f15c56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 27 Aug 2013 11:56:48 +0200 Subject: [PATCH 04/16] Add an extractor for CNN (closes #1318) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cnn.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/cnn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eeeb3db50..ea2af0d0e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..cee78765b --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,47 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + +class CNNIE(InfoExtractor): + _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P.+?/(?P[^/]+?)\.cnn)' + + _TEST = { + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_xml = self._download_webpage( + 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } From 0bc56fa66a4b0f1b6bf827bd3550a119d3e3b231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 12:38:30 +0200 Subject: [PATCH 05/16] Add an extractor for NBC news (closes #1320) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nbc.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/nbc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ea2af0d0e..27bbcc0f7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,7 @@ from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .nbc import NBCNewsIE from .ooyala import OoyalaIE from .pbs import PBSIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } From 7f3c4f4f65ddb4f8374b31b74428780e60a373de Mon Sep 17 00:00:00 2001 From: Jeff Smith <whydoubt@yahoo.com> Date: Tue, 27 Aug 2013 14:38:50 -0500 Subject: [PATCH 06/16] Initial slash in Google+ photos link was removed --- youtube_dl/extractor/googleplus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com' - video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), + DOMAIN = 'https://plus.google.com/' + video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page From acebc9cd6bd713c518e80d00af13e3120614e115 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:15:01 +0200 Subject: [PATCH 07/16] Revert "Install our own HTTPS handler as well (#1309)" This reverts commit 36399e85765a6a04fd84126264af75382fcfd1f6 and fixes #1322. --- youtube_dl/utils.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ab1049cc0..52cfb8a6d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -476,7 +476,7 @@ def formatSeconds(secs): def make_HTTPS_handler(opts): if sys.version_info < (3,2): # Python's 2.x handler is very simplistic - return YoutubeDLHandlerHTTPS() + return compat_urllib_request.HTTPSHandler() else: import ssl context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) @@ -485,7 +485,7 @@ def make_HTTPS_handler(opts): context.verify_mode = (ssl.CERT_NONE if opts.no_check_certificate else ssl.CERT_REQUIRED) - return YoutubeDLHandlerHTTPS(context=context) + return compat_urllib_request.HTTPSHandler(context=context) class ExtractorError(Exception): """Error during info extraction.""" @@ -569,8 +569,7 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected - -class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -603,8 +602,8 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler ret.code = code return ret - def _http_request(self, req): - for h, v in std_headers.items(): + def http_request(self, req): + for h,v in std_headers.items(): if h in req.headers: del req.headers[h] req.add_header(h, v) @@ -619,7 +618,7 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler del req.headers['Youtubedl-user-agent'] return req - def _http_response(self, req, resp): + def http_response(self, req, resp): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': @@ -633,16 +632,8 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler resp.msg = old_resp.msg return resp - -class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler): - http_request = YoutubeDLHandler_Template._http_request - http_response = YoutubeDLHandler_Template._http_response - - -class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler): - https_request = YoutubeDLHandler_Template._http_request - https_response = YoutubeDLHandler_Template._http_response - + https_request = http_request + https_response = http_response def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD""" From 88a79ce6a6edb6d3f8b6e3319f6ca1d3d2954c16 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:31:24 +0200 Subject: [PATCH 08/16] Delete default user agent (Fixes #1309) --- youtube_dl/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 614429073..bc6a6d180 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -430,6 +430,10 @@ def _real_main(argv=None): proxy_handler = compat_urllib_request.ProxyHandler(proxies) https_handler = make_HTTPS_handler(opts) opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders =[] compat_urllib_request.install_opener(opener) socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) From 1619e22f40883ba3c39f4d2a020cba3a1eebd34f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:31:36 +0200 Subject: [PATCH 09/16] release 2013.08.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dff568640..0b56e48dc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.27' +__version__ = '2013.08.28' From 273f603efb2028a54e04cca314b72bc2a9d767ef Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 00:14:19 +0200 Subject: [PATCH 10/16] [cnn] Allow more URLs --- youtube_dl/extractor/cnn.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index cee78765b..4338bd180 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -4,10 +4,12 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext + class CNNIE(InfoExtractor): - _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)' + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' - _TEST = { + _TESTS = [{ u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', u'md5': u'3e6121ea48df7e2259fe73a0628605c4', @@ -15,14 +17,24 @@ class CNNIE(InfoExtractor): u'title': u'Nadal wins 8th French Open title', u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', }, - } + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) path = mobj.group('path') page_title = mobj.group('title') - info_xml = self._download_webpage( - 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + print(info_url) + info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) formats = [] From 44586389e4676dfd926255cf76e36684dcf4742d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 02:18:44 +0200 Subject: [PATCH 11/16] [appletrailers] Add support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/appletrailers.py | 167 ++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 youtube_dl/extractor/appletrailers.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 27bbcc0f7..2f86f2aca 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,4 @@ +from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..7d126e2d2 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,167 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) + playlist_html = u'<html>' + playlist_cleaned + u'</html>' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//li/a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } From 0e283428f777a23de3c5a522aa283f87cda1b40a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:18:39 +0200 Subject: [PATCH 12/16] HTTPError is in urllib.error in Python 3, not in http.error --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f78b5fe78..e6fa634a7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -61,7 +61,7 @@ except ImportError: # Python 2 import httplib as compat_http_client try: - from http.error import HTTPError as compat_HTTPError + from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError From a1bb0f8773e0fff787ffe7bd1729073f3385d2ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:20:37 +0200 Subject: [PATCH 13/16] [cnn] remove debug print call. --- youtube_dl/extractor/cnn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 4338bd180..a79f881cd 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -33,7 +33,6 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path - print(info_url) info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) From 3e223834d9f358bc7cb1c3748dc63d1ab40d9b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:26:44 +0200 Subject: [PATCH 14/16] [youtube] update algo for length 88, thanks to @Ramhack (fixes #1328) --- devscripts/youtube_genalgo.py | 4 ++-- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 6f1d6ef99..917e8f79d 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -14,9 +14,9 @@ tests = [ # 89 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), - # 88 + # 88 - vflapUV9V 2013/08/28 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", - "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), + "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index af01c9da0..8e486afd0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: From 4f5f18acb93ea2bf70f80c7f76e6bb6b8dee3fbf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 10:28:16 +0200 Subject: [PATCH 15/16] [addanime] add file --- youtube_dl/extractor/addanime.py | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 youtube_dl/extractor/addanime.py diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..46db8262f --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,76 @@ +import ast +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'<form id="challenge-form" action="([^"]+)"', + redir_webpage, u'Redirect form') + vc = self._search_regex( + r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } From af8bd6a82d140e5a776185707a9b21d5b8a9fe52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:55:31 +0200 Subject: [PATCH 16/16] Show the time taken to download in the same format as the ETA --- youtube_dl/FileDownloader.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 4f6a23835..7c5ac4bc2 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -63,6 +63,17 @@ class FileDownloader(object): converted = float(bytes) / float(1024 ** exponent) return '%.2f%s' % (converted, suffix) + @staticmethod + def format_seconds(seconds): + (mins, secs) = divmod(seconds, 60) + (hours, eta_mins) = divmod(mins, 60) + if hours > 99: + return '--:--:--' + if hours == 0: + return '%02d:%02d' % (mins, secs) + else: + return '%02d:%02d:%02d' % (hours, mins, secs) + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -78,14 +89,7 @@ class FileDownloader(object): return '--:--' rate = float(current) / dif eta = int((float(total) - float(current)) / rate) - (eta_mins, eta_secs) = divmod(eta, 60) - (eta_hours, eta_mins) = divmod(eta_mins, 60) - if eta_hours > 99: - return '--:--:--' - if eta_hours == 0: - return '%02d:%02d' % (eta_mins, eta_secs) - else: - return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) + return FileDownloader.format_seconds(eta) @staticmethod def calc_speed(start, now, bytes): @@ -240,8 +244,8 @@ class FileDownloader(object): self.to_screen(u'[download] Download completed') else: clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') - self.to_screen(u'\r%s[download] 100%% of %s in %ss' % - (clear_line, data_len_str, int(tot_time))) + self.to_screen(u'\r%s[download] 100%% of %s in %s' % + (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename)