Merge branch 'maddoger-sportbox-fix'

10 years ago · 9123d64592
parent eeb23eb7ea b827a6015c
commit 9123d64592
3 changed files with 127 additions and 40 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -502,7 +502,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
 from .spiegeltv import SpiegeltvIE
 from .spike import SpikeIE
 from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
    SportBoxIE,
    SportBoxEmbedIE,
 )
 from .sportdeutschland import SportDeutschlandIE
 from .srf import SrfIE
 from .srmediathek import SRMediathekIE
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -32,6 +32,7 @@ from .brightcove import BrightcoveIE
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
 from .sportbox import SportBoxEmbedIE
 from .smotri import SmotriIE
 from .condenast import CondeNastIE
 from .udn import UDNEmbedIE
@ -224,6 +225,37 @@ class GenericIE(InfoExtractor):
                'skip_download': True,
            },
        },
        # SportBox embed
        {
            'url': 'http://www.vestifinance.ru/articles/25753',
            'info_dict': {
                'id': '25753',
                'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
            },
            'playlist': [{
                'info_dict': {
                    'id': '370908',
                    'title': 'Госзаказ. День 3',
                    'ext': 'mp4',
                }
            }, {
                'info_dict': {
                    'id': '370905',
                    'title': 'Госзаказ. День 2',
                    'ext': 'mp4',
                }
            }, {
                'info_dict': {
                    'id': '370902',
                    'title': 'Госзаказ. День 1',
                    'ext': 'mp4',
                }
            }],
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        # Embedded TED video
        {
            'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@ -1229,6 +1261,11 @@ class GenericIE(InfoExtractor):
        if rutv_url:
            return self.url_result(rutv_url, 'RUTV')
        # Look for embedded SportBox player
        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
        if sportbox_urls:
            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
        # Look for embedded TED player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
    parse_duration,
    parse_iso8601,
@ -11,9 +12,8 @@ from ..utils import (
 class SportBoxIE(InfoExtractor):
-    _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+    _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
-    _TESTS = [
+    _TESTS = [{
        {
        'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
        'md5': 'ff56a598c2cf411a9a38a69709e97079',
        'info_dict': {
@ -33,8 +33,10 @@ class SportBoxIE(InfoExtractor):
    }, {
        'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
        'only_matching': True,
-        }
+    }, {
-    ]
+        'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@ -42,35 +44,80 @@ class SportBoxIE(InfoExtractor):
        webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
+        player = self._search_regex(
-            r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+            r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
        player = self._download_webpage(
            'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
            display_id, 'Downloading player webpage')
        hls = self._search_regex(
            r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
        formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
        title = self._html_search_regex(
            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
        description = self._html_search_regex(
-            r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
+            r'(?s)<div itemprop="description">(.+?)</div>',
            webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage)
        timestamp = parse_iso8601(self._search_regex(
-            r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
+            r'<span itemprop="uploadDate">([^<]+)</span>',
            webpage, 'timestamp', fatal=False))
        duration = parse_duration(self._html_search_regex(
-            r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+            r'<meta itemprop="duration" content="PT([^"]+)">',
            webpage, 'duration', fatal=False))
        return {
-            'id': video_id,
+            '_type': 'url_transparent',
            'url': compat_urlparse.urljoin(url, '/%s' % player),
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
        }
 class SportBoxEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
        'info_dict': {
            'id': '211355',
            'ext': 'mp4',
            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
            'thumbnail': 're:^https?://.*\.jpg$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
        'only_matching': True,
    }]
    @staticmethod
    def _extract_urls(webpage):
        return re.findall(
            r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
            webpage)
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        hls = self._search_regex(
            r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
            webpage, 'hls file')
        formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
        title = self._search_regex(
            r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
        thumbnail = self._search_regex(
            r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
            webpage, 'thumbnail', default=None)
        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }