[comedycentral] fix extraction(closes #27905)

5 years ago · fa8f6d8580
parent 3bb7769c40
commit fa8f6d8580
4 changed files with 38 additions and 146 deletions
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,142 +1,51 @@
 from __future__ import unicode_literals
 from .mtv import MTVServicesInfoExtractor
 from .common import InfoExtractor
 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
+    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
        (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
        /(?P<title>.*)'''
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
    _TESTS = [{
-        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+        'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike',
-        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
+        'md5': 'b8acb347177c680ff18a292aa2166f80',
        'info_dict': {
-            'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025',
            'ext': 'mp4',
-            'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
+            'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike',
-            'description': 'After a certain point, breastfeeding becomes c**kblocking.',
+            'description': 'md5:5334307c433892b85f4f5e5ac9ef7498',
-            'timestamp': 1376798400,
+            'timestamp': 1598670000,
-            'upload_date': '20130818',
+            'upload_date': '20200829',
        },
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+        'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314',
        'only_matching': True,
    }]
 class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
        (?:full-episodes|shows(?=/[^/]+/full-episodes))
        /(?P<id>[^?]+)'''
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
    _TESTS = [{
        'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028',
        'info_dict': {
            'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."',
            'title': 'November 28, 2016 - Ryan Speedo Green',
        },
        'playlist_count': 4,
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
+        'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        # Download the page, pull the mgid out of the 't2_lc_promo1' data zone
        # and return whatever _get_videos_info yields for that mgid.
        episode_id = self._match_id(url)
        page_html = self._download_webpage(url, episode_id)
        return self._get_videos_info(
            self._extract_triforce_mgid(page_html, data_zone='t2_lc_promo1'))
 class ToshIE(MTVServicesInfoExtractor):
    # Declarative extractor for tosh.cc.com pages: this class only supplies
    # the description, URL pattern, feed URL and test fixtures. It overrides
    # no methods, so all behaviour comes from MTVServicesInfoExtractor.
    IE_DESC = 'Tosh.0'
    # Matches http(s)://tosh.cc.com/video-clips/<segment>/<videotitle> and the
    # same shape under /video-collections/. Note the captured group is named
    # `videotitle`, not `id`, and the pattern is not anchored at the end.
    _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
    # NOTE(review): presumably the MRSS feed endpoint read by the base class;
    # confirm against mtv.py.
    _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
    _TESTS = [{
        # Full fixture: playlist-level metadata plus one expected entry.
        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
        'info_dict': {
            'description': 'Tosh asked fans to share their summer plans.',
            'title': 'Twitter Users Share Summer Plans',
        },
        'playlist': [{
            'md5': 'f269e88114c1805bb6d7653fecea9e06',
            'info_dict': {
                'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
                'ext': 'mp4',
                'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
                'description': 'Tosh asked fans to share their summer plans.',
                'thumbnail': r're:^https?://.*\.jpg',
                # It's really reported to be published on year 2077
                'upload_date': '20770610',
                'timestamp': 3390510600,
                'subtitles': {
                    'en': 'mincount:3',
                },
            },
        }]
    }, {
        # URL-matching check only for the video-collections form.
        'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
        'only_matching': True,
    }]
 class ComedyCentralTVIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})'
    _TESTS = [{
-        'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+        'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1',
        'info_dict': {
-            'id': 'local_playlist-f99b626bdfe13568579a',
+            'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285',
-            'ext': 'flv',
+            'ext': 'mp4',
-            'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
+            'title': 'Josh Investigates',
-        },
+            'description': 'Steht uns das Ende der Welt bevor?',
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
        'only_matching': True,
    }, {
        'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
        'only_matching': True,
    }]
-
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
-    def _real_extract(self, url):
+    _GEO_COUNTRIES = ['DE']
-        video_id = self._match_id(url)
+
-
+    def _get_feed_query(self, uri):
-        webpage = self._download_webpage(url, video_id)
+        return {
-
+            'accountOverride': 'intl.mtvi.com',
-        mrss_url = self._search_regex(
+            'arcEp': 'web.cc.tv',
-            r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            'ep': 'b9032c3a',
-            webpage, 'mrss url', group='url')
+            'imageEp': 'web.cc.tv',
-
+            'mgid': uri,
        return self._get_videos_info_from_url(mrss_url, video_id)
 class ComedyCentralShortnameIE(InfoExtractor):
    # Maps the ':tds', ':thedailyshow' and ':theopposition' shortcuts to a
    # cc.com full-episodes URL and hands that URL to url_result.
    _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$'
    _TESTS = [
        {'url': ':tds', 'only_matching': True},
        {'url': ':thedailyshow', 'only_matching': True},
        {'url': ':theopposition', 'only_matching': True},
    ]
    def _real_extract(self, url):
        daily_show_url = 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes'
        opposition_url = 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes'
        # Both Daily Show shortcuts point at the same page.
        targets = {
            'tds': daily_show_url,
            'thedailyshow': daily_show_url,
            'theopposition': opposition_url,
        }
        # _VALID_URL admits only the three shortcut names listed above.
        return self.url_result(targets[self._match_id(url)])
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -235,11 +235,8 @@ from .cnn import (
 )
 from .coub import CoubIE
 from .comedycentral import (
    ComedyCentralFullEpisodesIE,
    ComedyCentralIE,
    ComedyCentralShortnameIE,
    ComedyCentralTVIE,
    ToshIE,
 )
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .commonprotocols import (
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor):
        return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
    @staticmethod
    def _extract_child_with_type(parent, t):
        return next(c for c in parent['children'] if c.get('type') == t)
    def _extract_mgid(self, webpage):
        try:
            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
@@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
        if not mgid:
            mgid = self._extract_triforce_mgid(webpage)
        if not mgid:
            data = self._parse_json(self._search_regex(
                r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
            main_container = self._extract_child_with_type(data, 'MainContainer')
            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
            mgid = video_player['props']['media']['video']['config']['uri']
        return mgid
    def _real_extract(self, url):
@@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor):
        'only_matching': True,
    }]
    @staticmethod
    def extract_child_with_type(parent, t):
        children = parent['children']
        return next(c for c in children if c.get('type') == t)
    def _extract_mgid(self, webpage):
        # The page carries a JSON object assigned to __DATA__; parse it, walk
        # MainContainer -> VideoPlayer and read the configured media uri.
        raw_data = self._search_regex(
            r'__DATA__\s*=\s*({.+?});', webpage, 'data')
        page_data = self._parse_json(raw_data, None)
        player = self.extract_child_with_type(
            self.extract_child_with_type(page_data, 'MainContainer'),
            'VideoPlayer')
        return player['props']['media']['video']['config']['uri']
 class MTVJapanIE(MTVServicesInfoExtractor):
    IE_NAME = 'mtvjapan'
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor):
    _FEED_URL = 'http://www.bellator.com/feeds/mrss/'
    _GEO_COUNTRIES = ['US']
    def _extract_mgid(self, webpage):
        # Pure delegate: forwards the page to _extract_triforce_mgid, passing
        # no other arguments, and returns its result unchanged.
        return self._extract_triforce_mgid(webpage)
 class ParamountNetworkIE(MTVServicesInfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
@@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
    def _get_feed_query(self, uri):
        return {
            'arcEp': 'paramountnetwork.com',
            'imageEp': 'paramountnetwork.com',
            'mgid': uri,
        }
    def _extract_mgid(self, webpage):
        # Parse the JSON assigned to window.__DATA__ and descend
        # MainContainer -> VideoPlayer to reach the media config uri.
        def first_child_of_type(node, wanted_type):
            # StopIteration escapes if no child has the wanted type.
            return next(
                child for child in node['children']
                if child.get('type') == wanted_type)
        data_json = self._search_regex(
            r'window\.__DATA__\s*=\s*({.+})',
            webpage, 'data')
        root = self._parse_json(data_json, None)
        container = first_child_of_type(root, 'MainContainer')
        player = first_child_of_type(container, 'VideoPlayer')
        return player['props']['media']['video']['config']['uri']