[sendtonews] Add new extractor

Used in CBSLocal. Part of #9522
9 years ago · 5ce3d5bd1b
parent 612b5f403e
commit 5ce3d5bd1b
2 changed files with 87 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -670,6 +670,7 @@ from .screencastomatic import ScreencastOMaticIE
 from .screenjunkies import ScreenJunkiesIE
 from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
 from .senateisvp import SenateISVPIE
 from .sendtonews import SendtoNewsIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .shahid import ShahidIE
--- a/youtube_dl/extractor/sendtonews.py
+++ b/youtube_dl/extractor/sendtonews.py
@ -0,0 +1,86 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .jwplatform import JWPlatformBaseIE
 from ..compat import compat_parse_qs
 from ..utils import (
    ExtractorError,
    parse_duration,
 )
 class SendtoNewsIE(JWPlatformBaseIE):
    _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)'
    _TEST = {
        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
        'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes',
        'info_dict': {
            'id': 'GxfCe0Zo7D-175909-5588',
            'ext': 'mp4',
            'title': 'Recap: CLE 15, CIN 6',
            'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
            'duration': 49,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }
    _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s'
    @classmethod
    def _extract_url(cls, webpage):
        mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
                .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
            \1>''', webpage)
        if mobj:
            sk, mk, pk = mobj.group('SC').split('-')
            return cls._URL_TEMPLATE % (sk, mk, pk)
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        params = compat_parse_qs(mobj.group('query'))
        if 'SK' not in params or 'MK' not in params or 'PK' not in params:
            raise ExtractorError('Invalid URL', expected=True)
        video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]])
        webpage = self._download_webpage(url, video_id)
        jwplayer_data_str = self._search_regex(
            r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data')
        js_vars = {
            'w': 1024,
            'h': 768,
            'modeVar': 'html5',
        }
        for name, val in js_vars.items():
            js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
            jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)
        info_dict = self._parse_jwplayer_data(
            self._parse_json(jwplayer_data_str, video_id),
            video_id, require_title=False, rtmp_params={'no_resume': True})
        title = self._html_search_regex(
            r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title')
        description = self._html_search_regex(
            r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage,
            'description', fatal=False)
        duration = parse_duration(self._html_search_regex(
            r'<div[^>]+class="embedDetails">([0-9:]+)', webpage,
            'duration', fatal=False))
        info_dict.update({
            'title': title,
            'description': description,
            'duration': duration,
        })
        return info_dict