Add an extractor for rottentomatoes.com and improve InternetVideoArchiveIE to get the best quality

12 years ago · 4b7b839f24
parent 3d60d33773
commit 4b7b839f24
4 changed files with 34 additions and 1 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -94,6 +94,7 @@ from .rbmaradio import RBMARadioIE
 from .redtube import RedTubeIE
 from .ringtv import RingTVIE
 from .ro220 import Ro220IE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
 from .sina import SinaIE
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -4,6 +4,7 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
    compat_urlparse,
    compat_urllib_parse,
    xpath_with_ns,
    determine_ext,
 )
@@ -26,6 +27,16 @@ class InternetVideoArchiveIE(InfoExtractor):
    def _build_url(query):
        """Return the flashconfiguration.aspx URL with *query* appended after the '?'.

        NOTE(review): takes no ``self``; presumably decorated with
        @staticmethod just above the visible lines -- confirm.
        """
        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
    @staticmethod
    def _clean_query(query):
        """Return a re-encoded copy of *query* reduced to the needed arguments.

        Only 'publishedid' and 'customerid' are kept (first value of each),
        then fixed 'playerid' and 'videokbrate' values are set before the
        result is url-encoded again.
        """
        wanted = ['publishedid', 'customerid']
        parsed = compat_urlparse.parse_qs(query)
        params = {}
        for name, values in parsed.items():
            if name in wanted:
                # parse_qs yields a list per key; only the first value is used
                params[name] = values[0]
        # Other player ids return m3u8 urls
        params['playerid'] = '247'
        # Very high requested bitrate; presumably makes the server pick its
        # best available quality -- confirm against the service
        params['videokbrate'] = '100000'
        return compat_urllib_parse.urlencode(params)
    def _real_extract(self, url):
        query = compat_urlparse.urlparse(url).query
        query_dic = compat_urlparse.parse_qs(query)
@@ -37,6 +48,11 @@ class InternetVideoArchiveIE(InfoExtractor):
        flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
        file_url = flashconfiguration.find('file').text
        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
        # Replace some of the parameters in the query to get the best quality
        # and http links (no m3u8 manifests)
        file_url = re.sub(r'(?<=\?)(.+)$',
            lambda m: self._clean_query(m.group()),
            file_url)
        info_xml = self._download_webpage(file_url, video_id,
            u'Downloading video info')
        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
--- a/youtube_dl/extractor/rottentomatoes.py
+++ b/youtube_dl/extractor/rottentomatoes.py
@@ -0,0 +1,16 @@
 from .videodetective import VideoDetectiveIE
 # This site uses the same method as videodetective.com:
 # the internetvideoarchive.com URL is extracted from the og:video property
 class RottenTomatoesIE(VideoDetectiveIE):
    """Extractor for rottentomatoes.com trailer pages.

    Only the URL pattern and a test fixture are defined here; everything
    else is inherited from VideoDetectiveIE.
    """
    # Matches .../m/<name>/trailers/<digits>; the digits are captured as 'id'.
    _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
    # Sample URL with the expected file name, title and description.
    # NOTE(review): the number in 'file' (613340) differs from the id in the
    # URL (11028566); presumably it is the id produced by the inherited
    # extraction -- confirm.
    _TEST = {
        u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
        u'file': '613340.mp4',
        u'info_dict': {
            u'title': u'TOY STORY 3',
            u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
        },
    }
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -16,7 +16,7 @@ class VideoDetectiveIE(InfoExtractor):
        u'info_dict': {
            u'title': u'KICK-ASS 2',
            u'description': u'md5:65ba37ad619165afac7d432eaded6013',
-            u'duration': 135,
+            u'duration': 138,
        },
    }