From d7e66d39a040886f940f4adf444be71e50e97391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 12 Oct 2013 21:34:04 +0200 Subject: [PATCH] Add an extractor for internetvideoarchive.com videos It's used by videodetective.com --- test/test_utils.py | 14 ++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/internetvideoarchive.py | 71 ++++++++++++++++++++ youtube_dl/utils.py | 13 ++++ 4 files changed, 99 insertions(+) create mode 100644 youtube_dl/extractor/internetvideoarchive.py diff --git a/test/test_utils.py b/test/test_utils.py index ff2e9885b..f2c03d421 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( unified_strdate, find_xpath_attr, get_meta_content, + xpath_with_ns, ) if sys.version_info < (3, 0): @@ -141,5 +142,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(get_meta('description'), u'foo & bar') self.assertEqual(get_meta('author'), 'Plato') + def test_xpath_with_ns(self): + testxml = u''' + + The Author + http://server.com/download.mp3 + + ''' + doc = xml.etree.ElementTree.fromstring(testxml) + find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) + self.assertTrue(find('media:song') is not None) + self.assertEqual(find('media:song/media:author').text, u'The Author') + self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bc191a012..e50a89149 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -62,6 +62,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE diff --git 
# ---------------------------------------------------------------------------
# New file added by this patch: youtube_dl/extractor/internetvideoarchive.py
# ---------------------------------------------------------------------------
import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    xpath_with_ns,
    determine_ext,
    ExtractorError,
)


class InternetVideoArchiveIE(InfoExtractor):
    """Extractor for videos served by video.internetvideoarchive.net.

    According to the commit that introduced it, this player is also the one
    used by videodetective.com.
    """

    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'

    _TEST = {
        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
        u'file': u'452693.mp4',
        u'info_dict': {
            u'title': u'SKYFALL',
            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
            u'duration': 156,
        },
    }

    # Prefix -> namespace URI map used to expand the paths queried on the
    # playlist document.
    _NAMESPACES = {
        'media': 'http://search.yahoo.com/mrss/',
        'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats',
    }

    @staticmethod
    def _build_url(query):
        """Return the flash configuration URL for the given query string."""
        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query

    def _real_extract(self, url):
        """Download the player configuration and playlist and build the info dict.

        The video id is the ``publishedid`` query parameter of *url*.  The
        returned dict carries ``id``, ``title``, ``formats`` (sorted by
        ascending bitrate), ``thumbnail``, ``description`` and ``duration``,
        plus the keys of the highest-bitrate format merged in at top level.

        Raises ExtractorError when the playlist item lists no
        ``media:content`` entries.
        """
        query = compat_urlparse.urlparse(url).query
        video_id = compat_urlparse.parse_qs(query)['publishedid'][0]
        # Always query the flashconfiguration.aspx endpoint, whatever player
        # page the incoming URL pointed at; the query string is kept as is.
        config_url = self._build_url(query)

        config_xml = self._download_webpage(
            config_url, video_id, u'Downloading flash configuration')
        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
        # The configuration references playlist.aspx; the mrssplaylist.aspx
        # variant of the same URL is requested instead.
        playlist_url = config.find('file').text.replace(
            '/playlist.aspx', '/mrssplaylist.aspx')
        playlist_xml = self._download_webpage(
            playlist_url, video_id, u'Downloading video info')
        playlist = xml.etree.ElementTree.fromstring(playlist_xml.encode('utf-8'))
        item = playlist.find('channel/item')

        def _bp(path):
            return xpath_with_ns(path, self._NAMESPACES)

        contents = item.findall(_bp('media:group/media:content'))
        if not contents:
            # Previously this surfaced as a NameError on the leaked loop
            # variable; fail with an explicit message instead.
            raise ExtractorError(
                u'No media:content entries found for video %s' % video_id)

        formats = []
        for content in contents:
            attr = content.attrib
            f_url = attr['url']
            formats.append({
                'url': f_url,
                'ext': determine_ext(f_url),
                'width': int(attr['width']),
                'bitrate': int(attr['bitrate']),
            })
        formats.sort(key=lambda f: f['bitrate'])

        # The duration is read from the last entry in document order, which
        # is the value the original code picked up after its loop finished.
        duration = int(contents[-1].attrib['duration'])

        info = {
            'id': video_id,
            'title': item.find('title').text,
            'formats': formats,
            'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
            'description': item.find('description').text,
            'duration': duration,
        }
        # TODO: Remove when #980 has been merged
        info.update(formats[-1])
        return info


# ---------------------------------------------------------------------------
# Helper added by this patch to youtube_dl/utils.py
# ---------------------------------------------------------------------------

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* into ``{uri}tag`` form.

    *path* is split on ``/``; every step containing a colon has its prefix
    looked up in *ns_map* (prefix -> namespace URI), and steps without a
    colon are passed through unchanged.

    Raises KeyError if a prefix is missing from *ns_map*, and ValueError if
    a step contains more than one colon.
    """
    replaced = []
    for step in path.split('/'):
        parts = step.split(':')
        if len(parts) == 1:
            replaced.append(parts[0])
        else:
            ns, tag = parts
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

# (The patch's trailing context continues with the existing
# htmlentity_transform(matchobj) function in youtube_dl/utils.py, which is
# unchanged and lies outside this excerpt.)