Add extractors for video.mit.edu and techtv.mit.edu (closes #1327)

video.mit.edu just embeds the videos from techtv.mit.edu
12 years ago · 67b22dd036
parent ce6a696e4d
commit 67b22dd036
2 changed files with 77 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -50,6 +50,7 @@ from .keek import KeekIE
 from .liveleak import LiveLeakIE
 from .livestream import LivestreamIE
 from .metacafe import MetacafeIE
 from .mit import TechTVMITIE, MITIE
 from .mixcloud import MixcloudIE
 from .mtv import MTVIE
 from .muzu import MuzuTVIE
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -0,0 +1,76 @@
 import re
 import json
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    get_element_by_id,
 )
 class TechTVMITIE(InfoExtractor):
    """Information extractor for videos on techtv.mit.edu.

    Matches both ``/videos/<id>`` and ``/embeds/<id>`` URLs. The video page
    provides the title and description; the embed page provides the
    cloudfront.net base URL, the list of bitrates and the thumbnail.
    """
    IE_NAME = u'techtv.mit.edu'
    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
        u'file': u'25418.mp4',
        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
        u'info_dict': {
            u'title': u'MIT DNA Learning Center Set',
            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Downloads the video page and the embed page for the numeric id
        found in *url*, then selects the entry with the highest ``bitrate``
        from the embed page's ``bitrates`` list.

        Returns a dict with the keys ``id``, ``title``, ``url``, ``ext``,
        ``description`` and ``thumbnail``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Both page URLs are rebuilt from the id, so it does not matter
        # whether the caller passed a /videos/ or an /embeds/ URL.
        webpage = self._download_webpage(
            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
        embed_page = self._download_webpage(
            'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
            note=u'Downloading embed page')

        # The dot is escaped so that only a literal "cloudfront.net/" host
        # suffix is accepted as the base URL.
        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront\.net/)',
            embed_page, u'base url')
        formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
            u'video formats')
        formats = json.loads(formats_json)
        # Ascending sort by bitrate: the last element is the best quality.
        formats = sorted(formats, key=lambda f: f['bitrate'])

        title = get_element_by_id('edit-title', webpage)
        # The description is optional metadata, so it is only cleaned when
        # the element lookup produced something.
        # NOTE(review): presumably get_element_by_id returns None when the
        # element is absent -- confirm against ..utils.
        description_html = get_element_by_id('edit-description', webpage)
        description = (
            clean_html(description_html)
            if description_html is not None else None)
        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
            embed_page, u'thumbnail', flags=re.DOTALL)

        return {'id': video_id,
                'title': title,
                # Format URLs carry an "mp4:" prefix that is not part of
                # the path below the base URL.
                'url': base_url + formats[-1]['url'].replace('mp4:', ''),
                'ext': 'mp4',
                'description': description,
                'thumbnail': thumbnail,
                }
 class MITIE(TechTVMITIE):
    """Information extractor for video.mit.edu watch pages.

    The watch page is only searched for the ``src`` of an ``<iframe>``;
    that URL is handed over to the TechTVMIT extractor as a URL result.
    """
    IE_NAME = u'video.mit.edu'
    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'

    _TEST = {
        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
        u'file': u'21783.mp4',
        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
        u'info_dict': {
            u'title': u'The Government is Profiling You',
            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
        },
    }

    def _real_extract(self, url):
        """Find the embedded player URL on the watch page and delegate to it."""
        # The path component after /watch/ doubles as the id shown in
        # progress messages.
        slug = re.match(self._VALID_URL, url).group('title')
        watch_page = self._download_webpage(url, slug)

        self.to_screen('%s: Extracting %s url' % (slug, TechTVMITIE.IE_NAME))
        iframe_src = self._search_regex(
            r'<iframe .*?src="(.+?)"', watch_page, u'embed url')

        return self.url_result(iframe_src, ie='TechTVMIT')