[megatvcom] Add extractors (#1980)

Authored by: zmousm
pull/2362/head
Zenon Mousmoulas 3 years ago committed by GitHub
parent fdf80059d9
commit 32b95bb643
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1326,6 +1326,10 @@ from .glomex import (
GlomexIE,
GlomexEmbedIE,
)
from .megatvcom import (
MegaTVComIE,
MegaTVComEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE

@ -102,6 +102,7 @@ from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .glomex import GlomexEmbedIE
from .megatvcom import MegaTVComEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3484,6 +3485,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
# Look for megatv.com embeds
megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
if megatvcom_urls:
return self.playlist_from_matches(
megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:

@ -0,0 +1,173 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
determine_ext,
ExtractorError,
extract_attributes,
get_element_by_class,
get_element_html_by_id,
HEADRequest,
parse_qs,
unescapeHTML,
unified_timestamp,
)
class MegaTVComBaseIE(InfoExtractor):
_PLAYER_DIV_ID = 'player_div_id'
def _extract_player_attrs(self, webpage):
player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
return {
re.sub(r'^data-(?:kwik_)?', '', k): v
for k, v in extract_attributes(player_el).items()
if k not in ('id',)
}
class MegaTVComIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom'
IE_DESC = 'megatv.com videos'
_VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'
_TESTS = [{
'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
},
}, {
'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
'info_dict': {
'id': '527800',
'ext': 'mp4',
'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
'timestamp': 1636048859,
'upload_date': '20211104',
'display_id': 'epeisodio-65-12',
'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
},
}]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'slug')
_is_article = video_id is None
webpage = self._download_webpage(url, video_id or display_id)
if _is_article:
video_id = self._search_regex(
r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')
player_attrs = self._extract_player_attrs(webpage)
title = player_attrs.get('label') or self._og_search_title(webpage)
description = get_element_by_class(
'article-wrapper' if _is_article else 'story_content',
webpage)
description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description))
if not description:
description = self._og_search_description(webpage)
thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
timestamp = unified_timestamp(self._html_search_meta(
'article:published_time', webpage))
source = player_attrs.get('source')
if not source:
raise ExtractorError('No source found', video_id=video_id)
if determine_ext(source) == 'm3u8':
formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
else:
formats, subs = [{'url': source}], {}
if player_attrs.get('subs'):
self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'formats': formats,
'subtitles': subs,
}
class MegaTVComEmbedIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom:embed'
IE_DESC = 'megatv.com embedded videos'
_VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
_EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
_TESTS = [{
'url': 'https://www.megatv.com/embed/?p=2020520979',
'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
},
}, {
'url': 'https://www.megatv.com/embed/?p=2020534081',
'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
'info_dict': {
'id': '534081',
'ext': 'mp4',
'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
'timestamp': 1636376351,
'upload_date': '20211108',
'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
},
}]
@classmethod
def _extract_urls(cls, webpage):
for mobj in cls._EMBED_RE.finditer(webpage):
yield unescapeHTML(mobj.group('url'))
def _match_canonical_url(self, webpage):
LINK_RE = r'''(?x)
<link(?:
rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
[^>]*?
)+>
'''
for mobj in re.finditer(LINK_RE, webpage):
canonical, href = mobj.group('canonical', 'href')
if canonical and href:
return unescapeHTML(href)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_attrs = self._extract_player_attrs(webpage)
canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
if not canonical_url:
raise ExtractorError('canonical URL not found')
video_id = parse_qs(canonical_url)['p'][0]
# Defer to megatvcom as the metadata extracted from the embeddable page some
# times are slightly different, for the same video
canonical_url = self._request_webpage(
HEADRequest(canonical_url), video_id,
note='Resolve canonical URL',
errnote='Could not resolve canonical URL').geturl()
return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
Loading…
Cancel
Save