From 1e399778ee870ee583135e65458268cd7c0fb923 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 20:03:05 +0800 Subject: [PATCH 1/5] [letv] Fix extraction Using data URIs for passing the decrypted M3U8 manifest, which is supported by ffmpeg only. --- youtube_dl/extractor/letv.py | 70 ++++++++++++++++++++++++++---------- youtube_dl/utils.py | 5 +++ 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index a28abb0f00..9ebbc8089e 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -9,13 +9,14 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, - compat_urlparse, + compat_ord, ) from ..utils import ( determine_ext, ExtractorError, parse_iso8601, int_or_none, + encode_data_uri, ) @@ -25,15 +26,16 @@ class LetvIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', 'ext': 'mp4', 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'timestamp': 1424747397, - 'upload_date': '20150224', 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - } + }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { @@ -42,16 +44,22 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', 'ext': 'mp4', 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, + 'params': { + 'hls_prefer_native': True, + }, 'skip': 'Only available in China', }] @@ -74,6 +82,27 @@ class LetvIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray() + while encrypted_data: + b = compat_ord(encrypted_data[0]) + _loc4_.extend([b // 16, b & 0x0f]) + encrypted_data = encrypted_data[1:] + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray() + while _loc4_: + _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) + _loc4_ = _loc4_[2:] + + return bytes(_loc7_) + def _real_extract(self, url): media_id = self._match_id(url) page = self._download_webpage(url, media_id) @@ -115,23 +144,28 @@ class LetvIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - - # Mimic what flvxz.com do - url_parts = list(compat_urlparse.urlparse(media_url)) - qs = dict(compat_urlparse.parse_qs(url_parts[4])) - qs.update({ - 'platid': '14', - 'splatid': '1401', - 'tss': 'no', - 'retry': 1 + media_url += '&' + compat_urllib_parse.urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, }) - url_parts[4] = compat_urllib_parse.urlencode(qs) - media_url = compat_urlparse.urlunparse(url_parts) + + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': media_url, + 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, + 'protocol': 'm3u8', } if format_id[-1:] == 'p': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dbe256616..db5b3698e7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -1795,6 +1796,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ From 985e4fdc07f00a3fdc8e7b7b4119471ee97f3890 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 17 Oct 2015 22:49:05 +0800 Subject: [PATCH 2/5] [downloader/hls] Add headers only for http(s) URLs ffmpeg 2.8.1 raises an error with -headers and non-http input files. --- youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a62d2047bb..9a83a73dd6 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,7 +30,7 @@ class HlsFD(FileDownloader): args = [ffpp.executable, '-y'] - if info_dict['http_headers']: + if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. args += [ From 0a67a3632bb9cf76f64658986defc1947090ef50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 17 Oct 2015 23:15:01 +0800 Subject: [PATCH 3/5] [compat] Add compat_urllib_request_DataHandler --- youtube_dl/compat.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 192e1c515e..d103ab9adf 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals +import binascii import collections +import email import getpass +import io import optparse import os import re @@ -38,6 +41,11 @@ try: except ImportError: # Python 2 import urlparse as compat_urlparse +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -155,6 +163,40 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(":", 1) + mediatype, data = data.split(",", 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string( + "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + try: compat_basestring = basestring # Python 2 except NameError: @@ -489,6 +531,8 @@ __all__ = [ 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', From 8b172c2e10fb38c62c213673304c7e8dcd17b768 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 17 Oct 2015 23:16:40 +0800 Subject: [PATCH 4/5] [YoutubeDL] Use DataHandler --- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index adf70d658b..12977bf808 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -37,6 +37,7 @@ from .compat import ( compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( ContentTooShortError, @@ -1967,8 +1968,9 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play From 05a3879f1c142cc2bf0287cde4690d8ccadcdc8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:19:46 +0800 Subject: [PATCH 5/5] [letv] Update M3U8's MIME type The new MIME type appears in the following places: https://www.iana.org/assignments/media-types/media-types.xhtml#application https://hg.python.org/cpython/file/tip/Lib/mimetypes.py --- youtube_dl/extractor/letv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 9ebbc8089e..effd9eb922 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -162,7 +162,7 @@ class LetvIE(InfoExtractor): m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), + 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, 'protocol': 'm3u8',