From 56124b0ac4bc8a2c24473569e23663e081370929 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Mon, 31 Mar 2025 17:35:50 +0100 Subject: [PATCH] changed based on D Trombett repo https://github.com/yt-dlp/yt-dlp/pull/12775/commits/c747e15cdf65ec4bf00f80f4cccd92832bd720fd --- yt_dlp/extractor/europa.py | 432 ++++++++++++++++++------------------- 1 file changed, 208 insertions(+), 224 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index d0f17c16fe..58b41816ee 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,272 +1,256 @@ -# -*- coding: utf-8 -*- from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, orderedSet, parse_duration, parse_iso8601, parse_qs, qualities, + str_or_none, traverse_obj, unified_strdate, + url_or_none, xpath_text, - js_to_json, - urljoin, - filter_dict, - HEADRequest, # Import HEADRequest ) -import re -import json -import urllib.error # Import urllib.error for HEAD check exception -# --- EuropaIE (Unchanged) --- + class EuropaIE(InfoExtractor): _WORKING = False _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' - _TESTS = [ - # Existing tests... - ] + _TESTS = [{ + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', + 'md5': '574f080699ddd1e19a675b0ddf010371', + 'info_dict': { + 'id': 'I107758', + 'ext': 'mp4', + 'title': 'TRADE - Wikileaks on TTIP', + 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', + }, + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }, { + 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) + playlist = self._download_xml( f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id) + def get_item(type_, preference): items = {} for item in playlist.findall(f'./info/{type_}/item'): - lang, label = (xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)) - if lang and label: items[lang] = label.strip() + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() for p in preference: - if items.get(p): return items[p] + if items.get(p): + return items[p] + query = parse_qs(url) preferred_lang = query.get('sitelang', ('en', ))[0] + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) + title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) + language_preference = qualities(preferred_langs[::-1]) + formats = [] for file_ in playlist.findall('./files/file'): video_url = xpath_text(file_, './url') - if not video_url: continue + if not video_url: + continue lang = xpath_text(file_, './lg') - formats.append({'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang)}) - return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } -# --- EuroParlWebstreamIE (Using JSON from iframe) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?: - multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| # Webstreaming page URL - live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?Pindex-archive|index|master|playlist|norsk-archive)(?:\.m3u8)? # Direct HLS URL base - ) + https?://multimedia\.europarl\.europa\.eu/ + (?P[^/]*/)?webstreaming/(?:[^_]*_)?(?P[\w-]+) ''' - _TESTS = [ - { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-agriculture-and-rural-development_20250327-0900-COMMITTEE-AGRI', - 'info_dict': { - 'id': '20250327-0900-COMMITTEE-AGRI', - 'title': r're:^Committee on Agriculture and Rural Development \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': False, - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, - # Uses the iframe JSON parsing which should yield 2113752 / channel-06-bxl + _TESTS = [{ + 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', + 'md5': '16420ad9c602663759538ac1ca16a8db', + 'info_dict': { + 'id': '20220914-0900-PLENARY', + 'ext': 'mp4', + 'title': 'Plenary session', + 'description': '', + 'duration': 45147, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'release_timestamp': 1663139069, + 'release_date': '20220914', + 'modified_timestamp': 1663650921, + 'modified_date': '20220920', + 'live_status': 'was_live', }, - { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/pre-session-briefing_20250328-1100-SPECIAL-PRESSEr', - 'info_dict': { - 'id': '20250328-1100-SPECIAL-PRESSEr', - 'title': r're:^Pre-session briefing \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': False, - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, - # Uses the iframe JSON parsing which should yield 2113747 / channel-01-bxl + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + 'md5': '8b4304f9e15a6e133100248fb55a5dce', + 'info_dict': { + 'ext': 'mp4', + 'id': '20221115-1000-SPECIAL-EUROSCOLA', + 'release_timestamp': 1668502798, + 'title': 'Euroscola', + 'release_date': '20221115', + 'live_status': 'was_live', + 'description': '', + 'duration': 9587, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1668945274, + 'modified_date': '20221120', }, - { # Test direct HLS URL with archive times - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113752/channel-06-bxl/index-archive.m3u8?startTime=1743068400&endTime=1743079800', - 'info_dict': { - 'id': 'index-archive', - 'title': 'European Parliament Stream 2113752/channel-06-bxl', - 'is_live': False, # Should be detected as not live from lack of live tags/duration - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'md5': '0ca01cf33009d866e6f5e1cd3088c10c', + 'info_dict': { + 'id': '20230301-1130-COMMITTEE-CULT', + 'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, + 'description': 'Committee on Culture and Education', + 'duration': 1003, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + 'live_status': 'was_live', }, - # Potentially add a known live stream test if one is available - ] - - def _log_debug(self, msg): - self.to_screen(f"[EuroParlWebstream] {msg}") - - def _extract_title_from_webpage(self, webpage, display_id): - """Extracts title from the main webstreaming page.""" - title_element = self._search_regex(r']*>(.*?)', webpage, 'title element', default=None) - if title_element: - # Clean up potential extra whitespace and HTML entities - title = re.sub(r'\s+', ' ', title_element).strip() - title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=title) - else: - # Fallback using meta tags or just the ID - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, default=display_id) - return title.replace('_', ' ') # Replace underscores often used in IDs - - def _perform_head_check(self, url, display_id, note=''): - """Performs a HEAD request to check if the HLS URL likely exists.""" - self._log_debug(f'[{display_id}] Performing HEAD check {note}on: {url}') - try: - self._request_webpage(HEADRequest(url), display_id, note=f'HEAD check {note}') - self._log_debug(f'[{display_id}] HEAD check {note}successful.') - return True - except ExtractorError as e: - # Specifically catch HTTP errors, especially 404 - if isinstance(e.cause, urllib.error.HTTPError): - self._log_debug(f'[{display_id}] HEAD check {note}failed: {e.cause.code} {e.cause.reason}') - else: - self._log_debug(f'[{display_id}] HEAD check {note}failed: {e}') - return False + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'md5': 'f2e8c30935f956a7165c2f4f4b4ee090', + 'info_dict': { + 'id': '20230524-0900-COMMITTEE-ENVI', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': 'Committee on Environment, Public Health and Food Safety', + 'release_timestamp': 1684912288, + 'live_status': 'was_live', + 'description': 'Committee on Environment, Public Health and Food Safety', + 'duration': 4831, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER', + 'md5': '518758eb706471c4c4ef3a134034a5bd', + 'info_dict': { + 'id': '20240320-1345-SPECIAL-PRESSER', + 'ext': 'mp4', + 'release_date': '20240320', + 'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'release_timestamp': 1710939767, + 'description': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'duration': 927, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20250328-1600-SPECIAL-PRESSER', + 'md5': 'dd1c5e67eb55e609998583d7c2966105', + 'info_dict': { + 'id': '20250328-1600-SPECIAL-PRESSER', + 'ext': 'mp4', + 'title': 'md5:04a2ab70c183dabe891a7cd190c3121d', + 'description': '', + 'duration': 1023, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'release_timestamp': 1743177199, + 'release_date': '20250328', + 'modified_timestamp': 1743180924, + 'modified_date': '20250328', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = mobj.group('id') - live_id_direct = mobj.group('live_id') - - # --- Handle Direct HLS URL Input --- - if live_id_direct: - self._log_debug(f"Processing Direct HLS URL: {url}") - channel_direct = mobj.group('channel') - stream_type_direct = mobj.group('stream_type') or 'stream' # Default name if not specified - base_url = f'https://live.media.eup.glcloud.eu/hls/live/{live_id_direct}/{channel_direct}/{stream_type_direct}' - - query_params_str = mobj.group(0).split('?', 1)[1] if '?' in mobj.group(0) else None - query_params = parse_qs(query_params_str) if query_params_str else {} - start_time_direct = traverse_obj(query_params, ('startTime', 0, {int_or_none})) - end_time_direct = traverse_obj(query_params, ('endTime', 0, {int_or_none})) - - # Construct the final URL ensuring .m3u8 is present - final_url = base_url + ('' if base_url.endswith('.m3u8') else '.m3u8') - if start_time_direct and end_time_direct: - final_url += f"?startTime={start_time_direct}&endTime={end_time_direct}" - elif query_params_str: # Append original query if not start/end time based - final_url += f"?{query_params_str}" - - # Basic title for direct URL - title = f'European Parliament Stream {live_id_direct}/{channel_direct}' - - # HEAD check is good even for direct URLs - if not self._perform_head_check(final_url, f"{live_id_direct}-{channel_direct}", '(direct)'): - raise ExtractorError(f'Direct HLS URL HEAD check failed: {final_url}', expected=True) - - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - final_url, display_id or stream_type_direct, 'mp4', m3u8_id='hls', fatal=True) - if not formats: raise ExtractorError(f'Could not extract formats from direct HLS URL: {final_url}', expected=True) - - return { - 'id': display_id or stream_type_direct, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': not (start_time_direct and end_time_direct) and '.m3u8' not in stream_type_direct # Guess based on URL structure - } - - # --- Handle Webstreaming Page URL --- - if not display_id: raise ExtractorError('Could not parse display ID from URL', expected=True) - - self._log_debug(f"Processing Webstreaming Page: {display_id}") - webpage = self._download_webpage(url, display_id) - title = self._extract_title_from_webpage(webpage, display_id) # Get title early - - self._log_debug(f'[{display_id}] Extracting metadata and iframe URL...') - nextjs_data = self._search_nextjs_data(webpage, display_id, default={}) - media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {} - - # Get initial start time, but prioritize iframe JSON later - initial_start_timestamp = traverse_obj(media_info, ('mediaDate', {parse_iso8601}, {int_or_none})) - iframe_url = traverse_obj(media_info, 'iframeUrls') # Usually just one URL string - - self._log_debug(f'[{display_id}] Initial Start Time={initial_start_timestamp}, Iframe URL={iframe_url}') - - if not iframe_url: - raise ExtractorError(f'[{display_id}] Could not find iframe URL in page metadata.', expected=True) - - # --- Attempt Extraction from Iframe JSON --- - self._log_debug(f'[{display_id}] Attempting extraction from iframe: {iframe_url}') - try: - iframe_content = self._download_webpage(iframe_url, display_id, note='Downloading iframe content') - json_data_str = self._search_regex( - r'', - iframe_content, 'iframe JSON data', default=None) - - if not json_data_str: - raise ExtractorError('Could not find ng-state JSON in iframe content.') - - iframe_json = self._parse_json(json_data_str, display_id, fatal=True) - - # Extract required info from the JSON structure - player_url_base = traverse_obj(iframe_json, ('contentEventKey', 'playerUrl')) - start_time = traverse_obj(iframe_json, ('contentEventKey', 'startTime', {int_or_none})) - end_time = traverse_obj(iframe_json, ('contentEventKey', 'endTime', {int_or_none})) - is_live = traverse_obj(iframe_json, ('contentEventKey', 'live')) # boolean - # Use title from JSON if available and seems better - json_title = traverse_obj(iframe_json, ('contentEventKey', 'title')) - if json_title: title = json_title - - - self._log_debug(f'[{display_id}] Found in iframe JSON: playerUrl={player_url_base}, startTime={start_time}, endTime={end_time}, is_live={is_live}') - - if not player_url_base: - raise ExtractorError('Could not extract playerUrl from iframe JSON.') - - # For recorded streams (archives), startTime and endTime are essential - if not is_live and (start_time is None or end_time is None): - raise ExtractorError('Missing startTime or endTime in iframe JSON for recorded stream.') - - # Construct the final URL - # Ensure base URL doesn't already have query params before adding ours - player_url_base = player_url_base.split('?')[0] - if not player_url_base.endswith('.m3u8'): - player_url_base += '.m3u8' # Ensure correct extension - - if is_live: - final_player_url = player_url_base # Live streams don't use start/end times - else: - final_player_url = f"{player_url_base}?startTime={start_time}&endTime={end_time}" - - # Perform HEAD check on the constructed URL - if not self._perform_head_check(final_player_url, display_id, '(dynamic)'): - raise ExtractorError(f'Dynamic HLS URL from iframe failed HEAD check: {final_player_url}') - - # Extract formats - self._log_debug(f'[{display_id}] Extracting formats from {final_player_url}') - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - final_player_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=True) # Use fatal=True, if extraction fails, it's an error - - if not formats: - raise ExtractorError(f'Could not extract M3U8 formats from {final_player_url}', expected=True) - - return { - 'id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - 'timestamp': start_time if not is_live else None, # Use JSON start time for VOD - 'duration': (end_time - start_time) if not is_live and start_time and end_time else None, - } - - except ExtractorError as e: - # Re-raise specific extractor errors - raise e - except Exception as e: - # Wrap unexpected errors - raise ExtractorError(f'[{display_id}] Error processing iframe content: {e}', cause=e) - - # This part should ideally not be reached if iframe extraction is mandatory - raise ExtractorError(f'[{display_id}] Failed to extract stream information from iframe.', expected=True) + lang, video_id = self._match_valid_url(url).group('lang', 'id') + query = { + 'lang': lang, + 'audio': lang, + 'autoplay': 'true', + 'logo': 'false', + 'muted': 'false', + 'fullscreen': 'true', + 'disclaimer': 'false', + 'multicast': 'true', + 'analytics': 'false', + } + webpage = self._download_webpage(f'https://control.eup.glcloud.eu/content-manager/content-page/{video_id}', + video_id, 'Downloading iframe', query=query) + stream_info = self._search_json(r'