[closertotruth] Update and improve (Closes #8680)

9 years ago · cb23192bc4
parent 41c1023300
commit cb23192bc4
2 changed files with 71 additions and 47 deletions
--- a/youtube_dl/extractor/closertotruth.py
+++ b/youtube_dl/extractor/closertotruth.py
@ -1,69 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class CloserToTruthIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P<id>\d+))'
+    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [
+    _TESTS = [{
        {
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
            'md5': '5c548bde260a9247ddfdc07c7458ed29',
        'info_dict': {
            'id': '0_zof1ktre',
            'display_id': 'solutions-the-mind-body-problem',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem?',
            'upload_date': '20140221',
            'timestamp': 1392956007,
            'uploader_id': 'CTTXML'
            }
        },
-        {
+        'params': {
-            'url': 'http://closertotruth.com/interviews/1725',
+            'skip_download': True,
            'md5': 'b00598fd6a38372edb976408f72c5792',
            'info_dict': {
                'id': '0_19qv5rn1',
                'ext': 'mov',
                'title': 'AyaFr-002 - Francisco J. Ayala',
                'upload_date': '20140307',
                'timestamp': 1394236431,
                'uploader_id': 'CTTXML'
            }
        },
-        {
+    }, {
        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
            'md5': '4dd96aa0a5c296afa5c0bd24895c2f16',
        'info_dict': {
            'id': '0_iuxai6g6',
            'display_id': 'how-do-brains-work',
            'ext': 'mov',
            'title': 'How do Brains Work?',
            'upload_date': '20140221',
            'timestamp': 1392956024,
            'uploader_id': 'CTTXML'
            }
        },
-    ]
+        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/interviews/1725',
        'info_dict': {
            'id': '1725',
            'title': 'AyaFr-002',
        },
        'playlist_mincount': 2,
    }]
    def _real_extract(self, url):
-        video_id = self._match_id(url)
+        display_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
-        video_title = self._search_regex(r'<title>(.+) \|.+</title>', webpage, 'video title')
+        webpage = self._download_webpage(url, display_id)
-        entry_id = self._search_regex(r'<a[^>]+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id")
+        partner_id = self._search_regex(
            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
            webpage, 'kaltura partner_id')
-        interviewee_name = self._search_regex(r'<div id="(?:node_interview_full_group_white_wrapper|node_interview_series_full_group_ajax_content)"(?:.|\n)*<h3>(.*)</h3>.+', webpage, "video interviewee_name", False)
+        title = self._search_regex(
            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
-        if interviewee_name:
+        select = self._search_regex(
-            video_title = video_title + ' - ' + interviewee_name
+            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
            webpage, 'select version', default=None)
        if select:
            entry_ids = set()
            entries = []
            for mobj in re.finditer(
                    r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
                    webpage):
                entry_id = mobj.group('id')
                if entry_id in entry_ids:
                    continue
                entry_ids.add(entry_id)
                entries.append({
                    '_type': 'url_transparent',
                    'url': 'kaltura:%s:%s' % (partner_id, entry_id),
                    'ie_key': 'Kaltura',
                    'title': mobj.group('title'),
                })
            if entries:
                return self.playlist_result(entries, display_id, title)
-        p_id = self._search_regex(r'<script[^>]+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id")
+        entry_id = self._search_regex(
            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
            webpage, 'kaltura entry_id', group='id')
        return {
            '_type': 'url_transparent',
-            'id': entry_id,
+            'display_id': display_id,
-            'url': 'kaltura:%s:%s' % (p_id, entry_id),
+            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': 'Kaltura',
-            'title': video_title
+            'title': title
        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -140,6 +140,7 @@ from .cliprs import ClipRsIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
 from .clubic import ClubicIE
 from .clyp import ClypIE