From 2d49720f89ac47a095f1c115404b6ae801aced22 Mon Sep 17 00:00:00 2001
From: Zenon Mousmoulas
Date: Tue, 1 Feb 2022 09:32:13 +0200
Subject: [PATCH] [ertgr] Add new extractors (#2338)

Authored-by: zmousm, dirkf
---

Notes:
    Overview of what this patch adds (text in this section is commentary
    and is not part of the commit):

    * ERTFlixBaseIE._call_api() queries
      https://api.app.ertflix.gr/v<api_version>/<method> non-fatally and
      returns the JSON response only when response['Result']['Success'] is
      True; otherwise it returns None. When `data` is given it is merged
      with {'platformCodename': 'www'} and sent as a JSON body.
    * ERTFlixBaseIE._call_api_get_tiles() returns the single tile whose 'Id'
      equals video_id, or the full tile list when extra tile ids are passed;
      it raises ExtractorError when the requested tiles are not all present.
    * ERTFlixCodenameIE handles internal 'ertflix:<codename>' URLs and
      collects HLS/DASH/direct formats from 'MediaFiles' -> 'Formats'.
    * ERTFlixIE handles ertflix.gr /series/ and /vod/ pages. Ids starting
      with 'ser.' become playlists (optionally filtered by the 'season'
      query parameter, by number or by title); everything else becomes a
      url_transparent result pointing at 'ertflix:<codename>'.
    * ERTWebtvEmbedIE handles the ert.gr dt-uni-vod.php player and is hooked
      into the generic extractor through _extract_urls().

    NOTE(review): every _VALID_URL must expose a named group `id`
    (presumably what _match_id() reads), and EMBED_RE must expose a named
    group `url` because _extract_urls() calls mobj.group('url'). EMBED_RE is
    anchored on '<iframe[^>]+?src=' so that only iframe sources are matched.

 yt_dlp/extractor/ertgr.py      | 316 +++++++++++++++++++++++++++++++++
 yt_dlp/extractor/extractors.py |   5 +
 yt_dlp/extractor/generic.py    |  17 ++
 3 files changed, 338 insertions(+)
 create mode 100644 yt_dlp/extractor/ertgr.py

diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py
new file mode 100644
index 000000000..19ce23f01
--- /dev/null
+++ b/yt_dlp/extractor/ertgr.py
@@ -0,0 +1,316 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    dict_get,
+    int_or_none,
+    merge_dicts,
+    parse_qs,
+    parse_age_limit,
+    parse_iso8601,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    url_or_none,
+    variadic,
+)
+
+
+class ERTFlixBaseIE(InfoExtractor):
+    def _call_api(
+            self, video_id, method='Player/AcquireContent', api_version=1,
+            param_headers=None, data=None, headers=None, **params):
+        platform_codename = {'platformCodename': 'www'}
+        headers_as_param = {'X-Api-Date-Format': 'iso', 'X-Api-Camel-Case': False}
+        headers_as_param.update(param_headers or {})
+        headers = headers or {}
+        if data:
+            headers['Content-Type'] = headers_as_param['Content-Type'] = 'application/json;charset=utf-8'
+            data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8')
+        query = merge_dicts(
+            {} if data else platform_codename,
+            {'$headers': json.dumps(headers_as_param)},
+            params)
+        response = self._download_json(
+            'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method),
+            video_id, fatal=False, query=query, data=data, headers=headers)
+        if try_get(response, lambda x: x['Result']['Success']) is True:
+            return response
+
+    def _call_api_get_tiles(self, video_id, *tile_ids):
+        requested_tile_ids = [video_id] + list(tile_ids)
+        requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids]
+        tiles_response = self._call_api(
+            video_id, method='Tile/GetTiles', api_version=2,
+            data={'RequestedTiles': requested_tiles})
+        tiles = try_get(tiles_response, lambda x: x['Tiles'], list) or []
+        if tile_ids:
+            if sorted([tile['Id'] for tile in tiles]) != sorted(requested_tile_ids):
+                raise ExtractorError('Requested tiles not found', video_id=video_id)
+            return tiles
+        try:
+            return next(tile for tile in tiles if tile['Id'] == video_id)
+        except StopIteration:
+            raise ExtractorError('No matching tile found', video_id=video_id)
+
+
+class ERTFlixCodenameIE(ERTFlixBaseIE):
+    IE_NAME = 'ertflix:codename'
+    IE_DESC = 'ERTFLIX videos by codename'
+    _VALID_URL = r'ertflix:(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'ertflix:monogramma-praxitelis-tzanoylinos',
+        'md5': '5b9c2cd171f09126167e4082fc1dd0ef',
+        'info_dict': {
+            'id': 'monogramma-praxitelis-tzanoylinos',
+            'ext': 'mp4',
+            'title': 'md5:ef0b439902963d56c43ac83c3f41dd0e',
+        },
+    },
+    ]
+
+    def _extract_formats_and_subs(self, video_id, allow_none=True):
+        media_info = self._call_api(video_id, codename=video_id)
+        formats, subs = [], {}
+        for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []:
+            for media in try_get(media_file, lambda x: x['Formats'], list) or []:
+                fmt_url = url_or_none(try_get(media, lambda x: x['Url']))
+                if not fmt_url:
+                    continue
+                ext = determine_ext(fmt_url)
+                if ext == 'm3u8':
+                    formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+                        fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
+                elif ext == 'mpd':
+                    formats_, subs_ = self._extract_mpd_formats_and_subtitles(
+                        fmt_url, video_id, mpd_id='dash', fatal=False)
+                else:
+                    formats.append({
+                        'url': fmt_url,
+                        'format_id': str_or_none(media.get('Id')),
+                    })
+                    continue
+                formats.extend(formats_)
+                self._merge_subtitles(subs_, target=subs)
+
+        if formats or not allow_none:
+            self._sort_formats(formats)
+        return formats, subs
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        formats, subs = self._extract_formats_and_subs(video_id)
+
+        if formats:
+            return {
+                'id': video_id,
+                'formats': formats,
+                'subtitles': subs,
+                'title': self._generic_title(url),
+            }
+
+
+class ERTFlixIE(ERTFlixBaseIE):
+    IE_NAME = 'ertflix'
+    IE_DESC = 'ERTFLIX videos'
+    _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)'
+    _TESTS = [{
+        'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates',
+        'md5': '6479d5e60fd7e520b07ba5411dcdd6e7',
+        'info_dict': {
+            'id': 'aoratoi-ergates',
+            'ext': 'mp4',
+            'title': 'md5:c1433d598fbba0211b0069021517f8b4',
+            'description': 'md5:01a64d113c31957eb7eb07719ab18ff4',
+            'thumbnail': r're:https?://.+\.jpg',
+            'episode_id': 'vod.173258',
+            'timestamp': 1639648800,
+            'upload_date': '20211216',
+            'duration': 3166,
+            'age_limit': 8,
+        },
+    }, {
+        'url': 'https://www.ertflix.gr/series/ser.3448-monogramma',
+        'info_dict': {
+            'id': 'ser.3448',
+            'age_limit': 8,
+            'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+            'title': 'Μονόγραμμα',
+        },
+        'playlist_mincount': 64,
+    }, {
+        'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1',
+        'info_dict': {
+            'id': 'ser.3448',
+            'age_limit': 8,
+            'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+            'title': 'Μονόγραμμα',
+        },
+        'playlist_count': 22,
+    }, {
+        'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1&season=2021%20-%202022',
+        'info_dict': {
+            'id': 'ser.3448',
+            'age_limit': 8,
+            'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+            'title': 'Μονόγραμμα',
+        },
+        'playlist_mincount': 36,
+    }, {
+        'url': 'https://www.ertflix.gr/series/ser.164991-to-diktuo-1?season=1-9',
+        'info_dict': {
+            'id': 'ser.164991',
+            'age_limit': 8,
+            'description': 'Η πρώτη ελληνική εκπομπή με θεματολογία αποκλειστικά γύρω από το ίντερνετ.',
+            'title': 'Το δίκτυο',
+        },
+        'playlist_mincount': 9,
+    }]
+
+    def _extract_episode(self, episode):
+        codename = try_get(episode, lambda x: x['Codename'], compat_str)
+        title = episode.get('Title')
+        description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', )))
+        if not codename or not title or not episode.get('HasPlayableStream', True):
+            return
+        thumbnail = next((
+            url_or_none(thumb.get('Url'))
+            for thumb in variadic(dict_get(episode, ('Images', 'Image')) or {})
+            if thumb.get('IsMain')),
+            None)
+        return {
+            '_type': 'url_transparent',
+            'thumbnail': thumbnail,
+            'id': codename,
+            'episode_id': episode.get('Id'),
+            'title': title,
+            'alt_title': episode.get('Subtitle'),
+            'description': description,
+            'timestamp': parse_iso8601(episode.get('PublishDate')),
+            'duration': episode.get('DurationSeconds'),
+            'age_limit': self._parse_age_rating(episode),
+            'url': 'ertflix:%s' % (codename, ),
+        }
+
+    @staticmethod
+    def _parse_age_rating(info_dict):
+        return parse_age_limit(
+            info_dict.get('AgeRating')
+            or (info_dict.get('IsAdultContent') and 18)
+            or (info_dict.get('IsKidsContent') and 0))
+
+    def _extract_series(self, video_id, season_titles=None, season_numbers=None):
+        media_info = self._call_api(video_id, method='Tile/GetSeriesDetails', id=video_id)
+
+        series = try_get(media_info, lambda x: x['Series'], dict) or {}
+        series_info = {
+            'age_limit': self._parse_age_rating(series),
+            'title': series.get('Title'),
+            'description': dict_get(series, ('ShortDescription', 'TinyDescription', )),
+        }
+        if season_numbers:
+            season_titles = season_titles or []
+            for season in try_get(series, lambda x: x['Seasons'], list) or []:
+                if season.get('SeasonNumber') in season_numbers and season.get('Title'):
+                    season_titles.append(season['Title'])
+
+        def gen_episode(m_info, season_titles):
+            for episode_group in try_get(m_info, lambda x: x['EpisodeGroups'], list) or []:
+                if season_titles and episode_group.get('Title') not in season_titles:
+                    continue
+                episodes = try_get(episode_group, lambda x: x['Episodes'], list)
+                if not episodes:
+                    continue
+                season_info = {
+                    'season': episode_group.get('Title'),
+                    'season_number': int_or_none(episode_group.get('SeasonNumber')),
+                }
+                try:
+                    episodes = [(int(ep['EpisodeNumber']), ep) for ep in episodes]
+                    episodes.sort()
+                except (KeyError, ValueError):
+                    episodes = enumerate(episodes, 1)
+                for n, episode in episodes:
+                    info = self._extract_episode(episode)
+                    if info is None:
+                        continue
+                    info['episode_number'] = n
+                    info.update(season_info)
+                    yield info
+
+        return self.playlist_result(
+            gen_episode(media_info, season_titles), playlist_id=video_id, **series_info)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        if video_id.startswith('ser.'):
+            param_season = parse_qs(url).get('season', [None])
+            param_season = [
+                (have_number, int_or_none(v) if have_number else str_or_none(v))
+                for have_number, v in
+                [(int_or_none(ps) is not None, ps) for ps in param_season]
+                if v is not None
+            ]
+            season_kwargs = {
+                k: [v for is_num, v in param_season if is_num is c] or None
+                for k, c in
+                [('season_titles', False), ('season_numbers', True)]
+            }
+            return self._extract_series(video_id, **season_kwargs)
+
+        return self._extract_episode(self._call_api_get_tiles(video_id))
+
+
+class ERTWebtvEmbedIE(InfoExtractor):
+    IE_NAME = 'ertwebtv:embed'
+    IE_DESC = 'ert.gr webtv embedded videos'
+    _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
+    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
+
+    _TESTS = [{
+        'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
+        'md5': 'f9e9900c25c26f4ecfbddbb4b6305854',
+        'info_dict': {
+            'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4',
+            'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497',
+            'ext': 'mp4',
+            'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg'
+        },
+    }]
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
+        EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
+
+        for mobj in re.finditer(EMBED_RE, webpage):
+            url = unescapeHTML(mobj.group('url'))
+            if not cls.suitable(url):
+                continue
+            yield url
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        formats, subs = self._extract_m3u8_formats_and_subtitles(
+            f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8',
+            video_id, 'mp4')
+        self._sort_formats(formats)
+        thumbnail_id = parse_qs(url).get('bgimg', [None])[0]
+        if thumbnail_id and not thumbnail_id.startswith('http'):
+            thumbnail_id = f'https://program.ert.gr{thumbnail_id}'
+        return {
+            'id': video_id,
+            'title': f'VOD - {video_id}',
+            'thumbnail': thumbnail_id,
+            'formats': formats,
+            'subtitles': subs,
+        }
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 384a9d415..82bd686ff 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -433,6 +433,11 @@ from .eroprofile import (
     EroProfileIE,
     EroProfileAlbumIE,
 )
+from .ertgr import (
+    ERTFlixCodenameIE,
+    ERTFlixIE,
+    ERTWebtvEmbedIE,
+)
 from .escapist import EscapistIE
 from .espn import (
     ESPNIE,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 934b354a9..131319d25 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -140,6 +140,7 @@ from .medialaan import MedialaanIE
 from .simplecast import SimplecastIE
 from .wimtv import WimTVIE
 from .tvopengr import TVOpenGrEmbedIE
+from .ertgr import ERTWebtvEmbedIE
 from .tvp import TVPEmbedIE
 from .blogger import BloggerIE
 from .mainstreaming import MainStreamingIE
@@ -1923,6 +1924,15 @@ class GenericIE(InfoExtractor):
                 },
             }]
         },
+        {
+            'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
+            'info_dict': {
+                'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4',
+                'ext': 'mp4',
+                'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464',
+                'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg',
+            },
+        },
         {
             # ThePlatform embedded with whitespaces in URLs
             'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
@@ -3693,6 +3703,13 @@ class GenericIE(InfoExtractor):
         if tvopengr_urls:
             return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
 
+        # Look for ert.gr webtv embeds
+        ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
+        if len(ertwebtv_urls) == 1:
+            return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
+        elif ertwebtv_urls:
+            return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
+
         tvp_urls = TVPEmbedIE._extract_urls(webpage)
         if tvp_urls:
             return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())