From 0fe2ff78e68ec03d56bf3d9434eb612ffb683977 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 4 May 2015 21:53:05 +0800 Subject: [PATCH] [NBC] Enhance embedURL extraction (closes #2549) --- test/test_utils.py | 5 +++++ youtube_dl/extractor/nbc.py | 11 +++++++++-- youtube_dl/utils.py | 8 ++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 6906a65c22..032d3656ae 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -53,6 +53,7 @@ from youtube_dl.utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + lowercase_escape, url_basename, urlencode_postdata, version_tuple, @@ -418,6 +419,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') + def test_lowercase_escape(self): + self.assertEqual(lowercase_escape('aä'), 'aä') + self.assertEqual(lowercase_escape('\\u0026'), '&') + def test_limit_length(self): self.assertEqual(limit_length(None, 12), None) self.assertEqual(limit_length('foo', 12), 'foo') diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6cbe03d0f6..dc2091be0d 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,6 +10,8 @@ from ..compat import ( from ..utils import ( ExtractorError, find_xpath_attr, + lowercase_escape, + unescapeHTML, ) @@ -46,18 +48,23 @@ class NBCIE(InfoExtractor): 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', }, 'skip': 'Only works from US', + }, + { + # This video has expired but with an escaped embedURL + 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', + 'skip': 'Expired' } ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._search_regex( + theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( [ r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', r'"embedURL"\s*:\s*"([^"]+)"' ], - webpage, 'theplatform url').replace('_no_endcard', '') + webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) if theplatform_url.startswith('//'): theplatform_url = 'http:' + theplatform_url return self.url_result(theplatform_url) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5a5c317e0..1013f7c187 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1486,6 +1486,14 @@ def uppercase_escape(s): s) +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str):