[megatvcom] Add extractors (#1980)

Authored by: zmousm
4 years ago · 32b95bb643
parent fdf80059d9
commit 32b95bb643
3 changed files with 184 additions and 0 deletions
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -1326,6 +1326,10 @@ from .glomex import (
    GlomexIE,
    GlomexEmbedIE,
 )
 from .megatvcom import (
    MegaTVComIE,
    MegaTVComEmbedIE,
 )
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@ -102,6 +102,7 @@ from .arte import ArteTVEmbedIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .glomex import GlomexEmbedIE
 from .megatvcom import MegaTVComEmbedIE
 from .limelight import LimelightBaseIE
 from .anvato import AnvatoIE
 from .washingtonpost import WashingtonPostIE
@ -3484,6 +3485,12 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
        # Look for megatv.com embeds
        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
        if megatvcom_urls:
            return self.playlist_from_matches(
                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
        # Look for WashingtonPost embeds
        wapo_urls = WashingtonPostIE._extract_urls(webpage)
        if wapo_urls:
--- a/yt_dlp/extractor/megatvcom.py
+++ b/yt_dlp/extractor/megatvcom.py
@ -0,0 +1,173 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    determine_ext,
    ExtractorError,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_id,
    HEADRequest,
    parse_qs,
    unescapeHTML,
    unified_timestamp,
 )
 class MegaTVComBaseIE(InfoExtractor):
    """Shared helpers for the megatv.com extractors."""
    # HTML id of the element whose attributes describe the player.
    _PLAYER_DIV_ID = 'player_div_id'
    def _extract_player_attrs(self, webpage):
        """Return the player element's attributes as a dict.

        The element is looked up by ``_PLAYER_DIV_ID``. A leading ``data-`` or
        ``data-kwik_`` prefix is stripped from every attribute name and the
        ``id`` attribute is dropped. An empty dict is returned when no player
        element was found, so callers can fall back or report a clear error.
        """
        player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
        if not player_el:
            # No player element: report "no attributes" instead of handing a
            # falsy value to extract_attributes().
            return {}
        return {
            re.sub(r'^data-(?:kwik_)?', '', k): v
            for k, v in extract_attributes(player_el).items()
            if k != 'id'
        }
 class MegaTVComIE(MegaTVComBaseIE):
    """Extractor for video pages on megatv.com (dated articles and id-based pages)."""
    IE_NAME = 'megatvcom'
    IE_DESC = 'megatv.com videos'
    # Either /YYYY/MM/DD/<slug> (no id in the URL) or /<section>/<id>/<slug>.
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'
    _TESTS = [{
        'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
        'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
            'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
        },
    }, {
        'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
        'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
        'info_dict': {
            'id': '527800',
            'ext': 'mp4',
            'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
            'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
            'timestamp': 1636048859,
            'upload_date': '20211104',
            'display_id': 'epeisodio-65-12',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
        },
    }]
    def _real_extract(self, url):
        """Extract formats and metadata for a megatv.com video page.

        Returns an info dict with id, display_id, title, description,
        thumbnail, timestamp, formats and subtitles.

        Raises ExtractorError when the player exposes no ``source`` attribute.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'slug')
        # The dated URL form has no id group; the id is then read from the page.
        _is_article = video_id is None
        webpage = self._download_webpage(url, video_id or display_id)
        if _is_article:
            video_id = self._search_regex(
                r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')
        player_attrs = self._extract_player_attrs(webpage)
        title = player_attrs.get('label') or self._og_search_title(webpage)
        description = get_element_by_class(
            'article-wrapper' if _is_article else 'story_content',
            webpage)
        # Substitute '' when no element was returned: re.sub() raises TypeError
        # on a non-string, which would make the OpenGraph fallback below
        # unreachable in exactly the case it is meant for.
        description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description or ''))
        if not description:
            description = self._og_search_description(webpage)
        thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
        timestamp = unified_timestamp(self._html_search_meta(
            'article:published_time', webpage))
        source = player_attrs.get('source')
        if not source:
            raise ExtractorError('No source found', video_id=video_id)
        if determine_ext(source) == 'm3u8':
            formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
        else:
            # Not an HLS manifest: treat the source as a single direct format.
            formats, subs = [{'url': source}], {}
        if player_attrs.get('subs'):
            # The player does not state the subtitle language, hence 'und'.
            self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)
        self._sort_formats(formats)
        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subs,
        }
 class MegaTVComEmbedIE(MegaTVComBaseIE):
    """Extractor for megatv.com embed pages; resolves them to the canonical video page."""
    IE_NAME = 'megatvcom:embed'
    IE_DESC = 'megatv.com embedded videos'
    _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
    # Matches <iframe> tags whose quoted src attribute is an embed URL.
    _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
    _TESTS = [{
        'url': 'https://www.megatv.com/embed/?p=2020520979',
        'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
            'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
        },
    }, {
        'url': 'https://www.megatv.com/embed/?p=2020534081',
        'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
        'info_dict': {
            'id': '534081',
            'ext': 'mp4',
            'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
            'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
            'timestamp': 1636376351,
            'upload_date': '20211108',
            'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
        },
    }]
    @classmethod
    def _extract_urls(cls, webpage):
        """Yield the HTML-unescaped embed URL of every matching iframe in webpage."""
        for mobj in cls._EMBED_RE.finditer(webpage):
            yield unescapeHTML(mobj.group('url'))
    def _match_canonical_url(self, webpage):
        """Return the unescaped href of the first <link> tag that has both
        rel="canonical" and an href attribute, or None when there is none."""
        # Verbose-mode pattern: the attributes may appear in either order, and
        # the named groups keep whatever the repeated alternation matched last.
        LINK_RE = r'''(?x)
        <link(?:
            rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
            href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
            [^>]*?
        )+>
        '''
        for mobj in re.finditer(LINK_RE, webpage):
            canonical, href = mobj.group('canonical', 'href')
            if canonical and href:
                return unescapeHTML(href)
    def _real_extract(self, url):
        """Resolve an embed page to its canonical page and delegate to MegaTVComIE.

        Raises ExtractorError when neither the player's share URL nor a
        canonical <link> is present on the embed page.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        player_attrs = self._extract_player_attrs(webpage)
        canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
        if not canonical_url:
            raise ExtractorError('canonical URL not found', video_id=video_id)
        # Prefer the id in the canonical URL's "p" query parameter. Keep the id
        # matched from the embed URL when that parameter is absent, instead of
        # failing with a KeyError.
        video_id = (parse_qs(canonical_url).get('p') or [video_id])[0]
        # Defer to megatvcom as the metadata extracted from the embeddable page
        # is sometimes slightly different for the same video
        canonical_url = self._request_webpage(
            HEADRequest(canonical_url), video_id,
            note='Resolve canonical URL',
            errnote='Could not resolve canonical URL').geturl()
        return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)