From b652a8a6b14b61019e7638f0e6abc3c3a580b44a Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:25:17 +0000 Subject: [PATCH 1/9] Update europa.py [europarl] Update extractor to support new stream URLs - Add support for live.media.eup.glcloud.eu direct HLS streams - Add live stream detection and handling without timestamps - Prioritise channel-07-bxl which is commonly used --- yt_dlp/extractor/europa.py | 357 ++++++++++++++++++++++++++++++------- 1 file changed, 291 insertions(+), 66 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index aa8baf2f78..7470305e95 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -9,7 +9,11 @@ from ..utils import ( traverse_obj, unified_strdate, xpath_text, + ExtractorError, ) +import re +import datetime +import time class EuropaIE(InfoExtractor): @@ -94,97 +98,318 @@ class EuropaIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://multimedia\.europarl\.europa\.eu/ - (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) + https?://(?: + multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| + live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?:channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w-]+)(?:\.m3u8|/master\.m3u8) + ) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', + 'id': '20220914-0900-PLENARY', 'display_id': '20220914-0900-PLENARY', 'ext': 'mp4', 'title': 'Plenary session', - 'release_timestamp': 1663139069, - 'release_date': '20220914', }, 'params': { 'skip_download': True, }, }, { - # live webstream - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + # New URL format for direct HLS streams + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870', 'info_dict': { + 'id': 'index-archive', 'ext': 'mp4', - 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', - 'release_timestamp': 1668502800, - 'title': 'Euroscola 2022-11-15 19:21', - 'release_date': '20221115', - 'live_status': 'is_live', + 'title': 'European Parliament Stream', }, - 'skip': 'not live anymore', - }, { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', - 'info_dict': { - 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', - 'display_id': '20230301-1130-COMMITTEE-CULT', - 'ext': 'mp4', - 'release_date': '20230301', - 'title': 'Committee on Culture and Education', - 'release_timestamp': 1677666641, + 'params': { + 'skip_download': True, }, }, { - # live stream - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/special-committee-on-housing-crisis-in-european-union-ordinary-meeting_20250324-1500-COMMITTEE-HOUS', 'info_dict': { - 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', + 'id': '20250324-1500-COMMITTEE-HOUS', + 'display_id': '20250324-1500-COMMITTEE-HOUS', 'ext': 'mp4', - 'release_date': '20230524', - 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', - 'release_timestamp': 1684911541, - 'live_status': 'is_live', + 'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting', + 'is_live': False, }, - 'skip': 'Not live anymore', - }, { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER', - 'info_dict': { - 'id': 'c1f11567-5b52-470a-f3e1-08dc3c216ace', - 'display_id': '20240320-1345-SPECIAL-PRESSER', - 'ext': 'mp4', - 'release_date': '20240320', - 'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', - 'release_timestamp': 1710939767, + 'params': { + 'skip_download': True, }, - }, { - 'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER', - 'only_matching': True, }] - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + # Known working stream IDs (in order of likely success) + _ARCHIVE_STREAM_IDS = [ + "index-archive", + "norsk-archive", + ] + + # Live stream IDs + _LIVE_STREAM_IDS = [ + "index", + "master", + "playlist", + "norsk", + ] - webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] + # Known CDN endpoints (in order of likely success) + _ENDPOINTS = [ + "2113753", # This appears to be the main endpoint + "2113749", + "2113750", + "2113751", + "2113752", + "2113754", + ] - json_info = self._download_json( - 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, - query={ - 'api-version': 1.0, - 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', - 'externalReference': display_id, - }) - - formats, subtitles = [], {} - for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): - fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) + # Prioritized channel list based on observations (channel-07-bxl is often used) + _CHANNELS = [ + "channel-07-bxl", # Most common based on examples + "channel-03-bxl", # Also seen in examples + "channel-01-bxl", + "channel-02-bxl", + "channel-04-bxl", + "channel-05-bxl", + "channel-06-bxl", + "channel-08-bxl", + "channel-09-bxl", + "channel-10-bxl", + ] + def _parse_meeting_id(self, display_id): + """Extract date and time information from the meeting ID.""" + date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) + if date_match: + date_str, time_str, meeting_type = date_match.groups() + try: + # Parse the date and time + year = int(date_str[:4]) + month = int(date_str[4:6]) + day = int(date_str[6:8]) + hour = int(time_str[:2]) + minute = int(time_str[2:4]) + + # Create datetime object + meeting_dt = datetime.datetime(year, month, day, hour, minute) + + # Calculate a reasonable meeting duration (2 hours by default) + end_dt = meeting_dt + datetime.timedelta(hours=2) + + # Check if meeting is today or in the future (potential live stream) + now = datetime.datetime.now() + is_today = (meeting_dt.year == now.year and + meeting_dt.month == now.month and + meeting_dt.day == now.day) + is_future = meeting_dt > now + is_recent_past = now - meeting_dt < datetime.timedelta(hours=6) + + return { + 'date': date_str, + 'time': time_str, + 'type': meeting_type, + 'start_dt': meeting_dt, + 'end_dt': end_dt, + 'start_timestamp': int(meeting_dt.timestamp()), + 'end_timestamp': int(end_dt.timestamp()), + 'is_today': is_today, + 'is_future': is_future, + 'is_recent_past': is_recent_past, + 'is_live_candidate': is_today or is_future or is_recent_past, + } + except (ValueError, OverflowError) as e: + self.report_warning(f"Failed to parse meeting date/time: {e}") + + # If we can't parse the date/time, use the current time minus 24 hours to now + current_time = int(time.time()) return { - 'id': json_info['id'], - 'display_id': display_id, - 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), - 'formats': formats, - 'subtitles': subtitles, - 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), - 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live', + 'start_timestamp': current_time - 86400, # 24 hours ago + 'end_timestamp': current_time, + 'is_live_candidate': True, # Assume it might be live if we can't parse the time } + + def _find_m3u8_in_webpage(self, webpage): + """Look for m3u8 URLs directly in the webpage.""" + m3u8_matches = re.findall( + r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])', + webpage + ) + if m3u8_matches: + return [url[0].replace('\\/', '/').replace('\\\\', '\\') for url in m3u8_matches] + + return [] + + def _extract_title_from_webpage(self, webpage): + """Extract the title from the webpage.""" + title = self._html_search_regex( + r'([^<]+)', + webpage, 'title', default='European Parliament Stream') + + # Clean up title + title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() + return title + + def _check_is_live(self, webpage): + """Check if the stream is likely to be live based on webpage content.""" + live_indicators = [ + r'(?i)live\s+now', + r'(?i)streaming\s+live', + r'(?i)watch\s+live', + r'(?i)live\s+stream', + r'(?i)currently\s+live', + r'(?i)livestream', + r'isLive\s*[:=]\s*true', + r'"isLive"\s*:\s*true', + r'data-is-live\s*=\s*["\'](true|1)["\']', + ] + + for indicator in live_indicators: + if re.search(indicator, webpage): + return True + + return False + + def _try_url(self, url, display_id): + """Try a single URL and return formats and subtitles if successful.""" + try: + self.to_screen(f"Trying URL: {url}") + fmt, subs = self._extract_m3u8_formats_and_subtitles( + url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if fmt: + return fmt, subs + except ExtractorError as e: + self.report_warning(f"Failed with URL {url}: {e}") + + return None, None + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + live_id = mobj.group('live_id') + stream_id = mobj.group('stream_id') + + # Handle direct HLS stream URLs + if live_id and stream_id: + # Strip any query parameters from stream_id + if '?' in stream_id: + stream_id = stream_id.split('?')[0] + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + url, stream_id, 'mp4', m3u8_id='hls', fatal=False) + + return { + 'id': stream_id, + 'title': 'European Parliament Stream', + 'formats': formats, + 'subtitles': subtitles, + } + + # If we're dealing with a europarl.europa.eu URL, download the webpage first + webpage = self._download_webpage(url, display_id) + title = self._extract_title_from_webpage(webpage) + + # Check if this is likely to be a live stream + is_live_page = self._check_is_live(webpage) + + # First, look for m3u8 URLs directly in the page + direct_urls = self._find_m3u8_in_webpage(webpage) + if direct_urls: + self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage") + for m3u8_url in direct_urls: + formats, subtitles = self._try_url(m3u8_url, display_id) + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live_page, + } + + # Parse the meeting ID and check if this is potentially a live stream + meeting_info = self._parse_meeting_id(display_id) + start_timestamp = meeting_info.get('start_timestamp') + end_timestamp = meeting_info.get('end_timestamp') + is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page + + self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}") + self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}") + + # First check for live streams if this is a live candidate + if is_live_candidate: + self.to_screen("Checking for live stream URLs first") + + for endpoint in self._ENDPOINTS[:2]: # Only try the first two endpoints for live + for channel in self._CHANNELS[:3]: # Only try the top 3 channels for live + for stream_type in self._LIVE_STREAM_IDS: + # For live streams, try URLs without timestamps + live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8" + formats, subtitles = self._try_url(live_url, display_id) + + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + # Try archived streams with prioritized channels + for channel in self._CHANNELS: + for stream_type in self._ARCHIVE_STREAM_IDS: + # For archived content, include timestamps + archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + formats, subtitles = self._try_url(archive_url, display_id) + + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': False, + } + + # If main endpoint + prioritized channels didn't work, try other endpoints + for endpoint in self._ENDPOINTS[1:]: + for channel in self._CHANNELS[:3]: # Only try the top 3 channels for other endpoints + for stream_type in self._ARCHIVE_STREAM_IDS: + archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + formats, subtitles = self._try_url(archive_url, display_id) + + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': False, + } + + # If we've reached here, we need to give a helpful error message + parsed_date = f"{meeting_info.get('date', 'unknown-date')}" + parsed_time = f"{meeting_info.get('time', 'unknown-time')}" + + # Provide different suggestions based on whether it's likely live or archived + if is_live_candidate: + suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8" + suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\"" + else: + suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\"" + + raise ExtractorError( + f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n" + f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n" + f"{suggestion_text}", + expected=True + ) From 2afe7e2fa3b21ab6f6879a05a06043eedbc864eb Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Tue, 25 Mar 2025 16:02:51 +0000 Subject: [PATCH 2/9] Update europa.py this makes it work with videos from the archive but not live videos --- yt_dlp/extractor/europa.py | 205 ++++++++++++++----------------------- 1 file changed, 76 insertions(+), 129 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 7470305e95..c3a03bf591 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -10,10 +10,13 @@ from ..utils import ( unified_strdate, xpath_text, ExtractorError, + js_to_json, + urljoin ) import re -import datetime +import json import time +import datetime class EuropaIE(InfoExtractor): @@ -132,7 +135,6 @@ class EuroParlWebstreamIE(InfoExtractor): 'display_id': '20250324-1500-COMMITTEE-HOUS', 'ext': 'mp4', 'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting', - 'is_live': False, }, 'params': { 'skip_download': True, @@ -140,21 +142,13 @@ class EuroParlWebstreamIE(InfoExtractor): }] # Known working stream IDs (in order of likely success) - _ARCHIVE_STREAM_IDS = [ + KNOWN_STREAM_IDS = [ "index-archive", "norsk-archive", ] - - # Live stream IDs - _LIVE_STREAM_IDS = [ - "index", - "master", - "playlist", - "norsk", - ] # Known CDN endpoints (in order of likely success) - _ENDPOINTS = [ + KNOWN_ENDPOINTS = [ "2113753", # This appears to be the main endpoint "2113749", "2113750", @@ -164,7 +158,7 @@ class EuroParlWebstreamIE(InfoExtractor): ] # Prioritized channel list based on observations (channel-07-bxl is often used) - _CHANNELS = [ + PRIORITIZED_CHANNELS = [ "channel-07-bxl", # Most common based on examples "channel-03-bxl", # Also seen in examples "channel-01-bxl", @@ -179,6 +173,7 @@ class EuroParlWebstreamIE(InfoExtractor): def _parse_meeting_id(self, display_id): """Extract date and time information from the meeting ID.""" + # Format: YYYYMMDD-HHMM-COMMITTEE-TYPE date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) if date_match: date_str, time_str, meeting_type = date_match.groups() @@ -196,14 +191,6 @@ class EuroParlWebstreamIE(InfoExtractor): # Calculate a reasonable meeting duration (2 hours by default) end_dt = meeting_dt + datetime.timedelta(hours=2) - # Check if meeting is today or in the future (potential live stream) - now = datetime.datetime.now() - is_today = (meeting_dt.year == now.year and - meeting_dt.month == now.month and - meeting_dt.day == now.day) - is_future = meeting_dt > now - is_recent_past = now - meeting_dt < datetime.timedelta(hours=6) - return { 'date': date_str, 'time': time_str, @@ -212,10 +199,6 @@ class EuroParlWebstreamIE(InfoExtractor): 'end_dt': end_dt, 'start_timestamp': int(meeting_dt.timestamp()), 'end_timestamp': int(end_dt.timestamp()), - 'is_today': is_today, - 'is_future': is_future, - 'is_recent_past': is_recent_past, - 'is_live_candidate': is_today or is_future or is_recent_past, } except (ValueError, OverflowError) as e: self.report_warning(f"Failed to parse meeting date/time: {e}") @@ -225,11 +208,11 @@ class EuroParlWebstreamIE(InfoExtractor): return { 'start_timestamp': current_time - 86400, # 24 hours ago 'end_timestamp': current_time, - 'is_live_candidate': True, # Assume it might be live if we can't parse the time } def _find_m3u8_in_webpage(self, webpage): """Look for m3u8 URLs directly in the webpage.""" + # Look for direct m3u8 URLs with timestamps m3u8_matches = re.findall( r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])', webpage @@ -252,40 +235,6 @@ class EuroParlWebstreamIE(InfoExtractor): title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() return title - def _check_is_live(self, webpage): - """Check if the stream is likely to be live based on webpage content.""" - live_indicators = [ - r'(?i)live\s+now', - r'(?i)streaming\s+live', - r'(?i)watch\s+live', - r'(?i)live\s+stream', - r'(?i)currently\s+live', - r'(?i)livestream', - r'isLive\s*[:=]\s*true', - r'"isLive"\s*:\s*true', - r'data-is-live\s*=\s*["\'](true|1)["\']', - ] - - for indicator in live_indicators: - if re.search(indicator, webpage): - return True - - return False - - def _try_url(self, url, display_id): - """Try a single URL and return formats and subtitles if successful.""" - try: - self.to_screen(f"Trying URL: {url}") - fmt, subs = self._extract_m3u8_formats_and_subtitles( - url, display_id, 'mp4', m3u8_id='hls', fatal=False) - - if fmt: - return fmt, subs - except ExtractorError as e: - self.report_warning(f"Failed with URL {url}: {e}") - - return None, None - def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') @@ -312,104 +261,102 @@ class EuroParlWebstreamIE(InfoExtractor): webpage = self._download_webpage(url, display_id) title = self._extract_title_from_webpage(webpage) - # Check if this is likely to be a live stream - is_live_page = self._check_is_live(webpage) - # First, look for m3u8 URLs directly in the page direct_urls = self._find_m3u8_in_webpage(webpage) if direct_urls: self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage") for m3u8_url in direct_urls: - formats, subtitles = self._try_url(m3u8_url, display_id) - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live_page, - } + try: + self.to_screen(f"Trying direct URL: {m3u8_url}") + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + } + except ExtractorError as e: + self.report_warning(f"Failed with direct URL {m3u8_url}: {e}") - # Parse the meeting ID and check if this is potentially a live stream + # If no direct URLs found, parse the meeting ID and generate likely timestamps meeting_info = self._parse_meeting_id(display_id) start_timestamp = meeting_info.get('start_timestamp') end_timestamp = meeting_info.get('end_timestamp') - is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}") - self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}") - # First check for live streams if this is a live candidate - if is_live_candidate: - self.to_screen("Checking for live stream URLs first") - - for endpoint in self._ENDPOINTS[:2]: # Only try the first two endpoints for live - for channel in self._CHANNELS[:3]: # Only try the top 3 channels for live - for stream_type in self._LIVE_STREAM_IDS: - # For live streams, try URLs without timestamps - live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8" - formats, subtitles = self._try_url(live_url, display_id) - - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': True, - } + # Try a variety of possibilities, starting with the most likely combinations + formats = [] + subtitles = {} + working_url = None - # Try archived streams with prioritized channels - for channel in self._CHANNELS: - for stream_type in self._ARCHIVE_STREAM_IDS: - # For archived content, include timestamps - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - formats, subtitles = self._try_url(archive_url, display_id) + # Main endpoint with prioritized channels + for channel in self.PRIORITIZED_CHANNELS: + for stream_type in self.KNOWN_STREAM_IDS: + candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + self.to_screen(f"Trying URL: {candidate_url}") - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': False, - } - - # If main endpoint + prioritized channels didn't work, try other endpoints - for endpoint in self._ENDPOINTS[1:]: - for channel in self._CHANNELS[:3]: # Only try the top 3 channels for other endpoints - for stream_type in self._ARCHIVE_STREAM_IDS: - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - formats, subtitles = self._try_url(archive_url, display_id) + try: + fmt, subs = self._extract_m3u8_formats_and_subtitles( + candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - if formats: + if fmt: + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + working_url = candidate_url + self.to_screen(f"Success! Found working URL: {working_url}") + return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': False, } + except ExtractorError as e: + self.report_warning(f"Failed with URL {candidate_url}: {e}") + + # If main endpoint + prioritized channels didn't work, try other endpoints + for endpoint in self.KNOWN_ENDPOINTS[1:]: # Skip the first one as we already tried it + for channel in self.PRIORITIZED_CHANNELS[:3]: # Only try the top 3 channels for other endpoints + for stream_type in self.KNOWN_STREAM_IDS: + candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + self.to_screen(f"Trying URL: {candidate_url}") + + try: + fmt, subs = self._extract_m3u8_formats_and_subtitles( + candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if fmt: + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + working_url = candidate_url + self.to_screen(f"Success! Found working URL: {working_url}") + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + } + except ExtractorError as e: + self.report_warning(f"Failed with URL {candidate_url}: {e}") # If we've reached here, we need to give a helpful error message parsed_date = f"{meeting_info.get('date', 'unknown-date')}" parsed_time = f"{meeting_info.get('time', 'unknown-time')}" - # Provide different suggestions based on whether it's likely live or archived - if is_live_candidate: - suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8" - suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\"" - else: - suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\"" + # Provide the most likely URL for manual use + suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" raise ExtractorError( f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n" - f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n" - f"{suggestion_text}", + f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n" + f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"", expected=True ) From 43ba015d276edaa80168a05b247b2e33e2954302 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:54:51 +0000 Subject: [PATCH 3/9] Update europa.py --- yt_dlp/extractor/europa.py | 310 ++++++++++++++++++------------------- 1 file changed, 155 insertions(+), 155 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index c3a03bf591..8dd0ddcd88 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -103,7 +103,7 @@ class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?: multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| - live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?:channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w-]+)(?:\.m3u8|/master\.m3u8) + live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w-]+)(?:\.m3u8|/master\.m3u8) ) ''' _TESTS = [{ @@ -118,7 +118,7 @@ class EuroParlWebstreamIE(InfoExtractor): 'skip_download': True, }, }, { - # New URL format for direct HLS streams + # Direct HLS stream URL 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870', 'info_dict': { 'id': 'index-archive', @@ -128,187 +128,144 @@ class EuroParlWebstreamIE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/special-committee-on-housing-crisis-in-european-union-ordinary-meeting_20250324-1500-COMMITTEE-HOUS', - 'info_dict': { - 'id': '20250324-1500-COMMITTEE-HOUS', - 'display_id': '20250324-1500-COMMITTEE-HOUS', - 'ext': 'mp4', - 'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting', - }, - 'params': { - 'skip_download': True, - }, }] - # Known working stream IDs (in order of likely success) - KNOWN_STREAM_IDS = [ - "index-archive", - "norsk-archive", - ] + # Main CDN endpoint - primarily target this instead of trying multiple + MAIN_ENDPOINT = "2113753" + + # Priority channels based on observed success rates + PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"] + + # Default stream types by content type + LIVE_STREAM_TYPES = ["index", "master", "playlist"] + ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"] - # Known CDN endpoints (in order of likely success) - KNOWN_ENDPOINTS = [ - "2113753", # This appears to be the main endpoint - "2113749", - "2113750", - "2113751", - "2113752", - "2113754", - ] + def _extract_direct_url_from_webpage(self, webpage): + """Extract direct m3u8 URLs from webpage with minimal logging""" + m3u8_urls = [] + + # Search patterns for m3u8 URLs + for pattern in [ + r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']', + r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', + r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', + ]: + matches = re.findall(pattern, webpage) + if matches: + m3u8_urls.extend(matches) + + # Clean up URLs + clean_urls = [] + for url in m3u8_urls: + # Remove any JS string escaping + url = url.replace('\\/', '/').replace('\\\\', '\\') + clean_urls.append(url) + + # Extract from network panel if available + network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE) + if network_url_match: + clean_urls.append(network_url_match.group(1)) + + return clean_urls - # Prioritized channel list based on observations (channel-07-bxl is often used) - PRIORITIZED_CHANNELS = [ - "channel-07-bxl", # Most common based on examples - "channel-03-bxl", # Also seen in examples - "channel-01-bxl", - "channel-02-bxl", - "channel-04-bxl", - "channel-05-bxl", - "channel-06-bxl", - "channel-08-bxl", - "channel-09-bxl", - "channel-10-bxl", - ] + def _extract_title_from_webpage(self, webpage, display_id): + """Extract the title from the webpage""" + # Try different patterns to extract the title + for pattern in [ + r'([^<]+)', + r']*>([^<]+)', + r'"title"\s*:\s*"([^"]+)"', + ]: + title_match = re.search(pattern, webpage) + if title_match: + title = title_match.group(1).strip() + # Clean up common suffixes + title = re.sub(r'\s*\|\s*European Parliament$', '', title) + title = re.sub(r'\s*-\s*Multimedia Centre$', '', title) + return title + + return f"European Parliament Session - {display_id}" - def _parse_meeting_id(self, display_id): - """Extract date and time information from the meeting ID.""" - # Format: YYYYMMDD-HHMM-COMMITTEE-TYPE + def _parse_meeting_date(self, display_id): + """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)""" date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) if date_match: date_str, time_str, meeting_type = date_match.groups() try: - # Parse the date and time + # Parse the date components year = int(date_str[:4]) month = int(date_str[4:6]) day = int(date_str[6:8]) hour = int(time_str[:2]) minute = int(time_str[2:4]) - # Create datetime object + # Create timestamps with a generous window (3 hours before and after) meeting_dt = datetime.datetime(year, month, day, hour, minute) + start_dt = meeting_dt - datetime.timedelta(hours=3) + end_dt = meeting_dt + datetime.timedelta(hours=6) - # Calculate a reasonable meeting duration (2 hours by default) - end_dt = meeting_dt + datetime.timedelta(hours=2) + # Convert to timestamps + start_ts = int(start_dt.timestamp()) + end_ts = int(end_dt.timestamp()) - return { - 'date': date_str, - 'time': time_str, - 'type': meeting_type, - 'start_dt': meeting_dt, - 'end_dt': end_dt, - 'start_timestamp': int(meeting_dt.timestamp()), - 'end_timestamp': int(end_dt.timestamp()), - } - except (ValueError, OverflowError) as e: - self.report_warning(f"Failed to parse meeting date/time: {e}") - - # If we can't parse the date/time, use the current time minus 24 hours to now - current_time = int(time.time()) - return { - 'start_timestamp': current_time - 86400, # 24 hours ago - 'end_timestamp': current_time, - } - - def _find_m3u8_in_webpage(self, webpage): - """Look for m3u8 URLs directly in the webpage.""" - # Look for direct m3u8 URLs with timestamps - m3u8_matches = re.findall( - r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])', - webpage - ) - if m3u8_matches: - return [url[0].replace('\\/', '/').replace('\\\\', '\\') for url in m3u8_matches] - - return [] - - def _extract_title_from_webpage(self, webpage): - """Extract the title from the webpage.""" - title = self._html_search_regex( - r'([^<]+)', - webpage, 'title', default='European Parliament Stream') + return start_ts, end_ts + + except (ValueError, OverflowError): + pass - # Clean up title - title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() - return title + # Fallback to a recent 48-hour window + now = int(time.time()) + start_time = now - (48 * 3600) # 48 hours ago + return start_time, now def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') live_id = mobj.group('live_id') stream_id = mobj.group('stream_id') + channel = mobj.group('channel') - # Handle direct HLS stream URLs + # Handle direct HLS URLs if live_id and stream_id: - # Strip any query parameters from stream_id - if '?' in stream_id: - stream_id = stream_id.split('?')[0] + # Remove query parameters from stream_id if present + clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id formats, subtitles = self._extract_m3u8_formats_and_subtitles( - url, stream_id, 'mp4', m3u8_id='hls', fatal=False) - + url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) + return { - 'id': stream_id, + 'id': clean_stream_id, 'title': 'European Parliament Stream', 'formats': formats, 'subtitles': subtitles, } - # If we're dealing with a europarl.europa.eu URL, download the webpage first + # Download the webpage for standard europarl URLs webpage = self._download_webpage(url, display_id) - title = self._extract_title_from_webpage(webpage) - # First, look for m3u8 URLs directly in the page - direct_urls = self._find_m3u8_in_webpage(webpage) - if direct_urls: - self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage") - for m3u8_url in direct_urls: - try: - self.to_screen(f"Trying direct URL: {m3u8_url}") - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - } - except ExtractorError as e: - self.report_warning(f"Failed with direct URL {m3u8_url}: {e}") + # Check for live indicators + is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE)) - # If no direct URLs found, parse the meeting ID and generate likely timestamps - meeting_info = self._parse_meeting_id(display_id) - start_timestamp = meeting_info.get('start_timestamp') - end_timestamp = meeting_info.get('end_timestamp') + # Extract title + title = self._extract_title_from_webpage(webpage, display_id) - self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}") + # First try direct URLs from the webpage (this is the most reliable approach) + direct_urls = self._extract_direct_url_from_webpage(webpage) - # Try a variety of possibilities, starting with the most likely combinations + # Track whether we successfully found a stream formats = [] subtitles = {} - working_url = None - # Main endpoint with prioritized channels - for channel in self.PRIORITIZED_CHANNELS: - for stream_type in self.KNOWN_STREAM_IDS: - candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - self.to_screen(f"Trying URL: {candidate_url}") - + if direct_urls: + for m3u8_url in direct_urls: try: fmt, subs = self._extract_m3u8_formats_and_subtitles( - candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) if fmt: formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - working_url = candidate_url - self.to_screen(f"Success! Found working URL: {working_url}") return { 'id': display_id, @@ -316,26 +273,58 @@ class EuroParlWebstreamIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } - except ExtractorError as e: - self.report_warning(f"Failed with URL {candidate_url}: {e}") + except ExtractorError: + pass + + # Parse timestamps for archive retrieval (or use current time for live) + if is_live: + # For live streams, we don't need timestamps + start_timestamp, end_timestamp = None, None + else: + start_timestamp, end_timestamp = self._parse_meeting_date(display_id) - # If main endpoint + prioritized channels didn't work, try other endpoints - for endpoint in self.KNOWN_ENDPOINTS[1:]: # Skip the first one as we already tried it - for channel in self.PRIORITIZED_CHANNELS[:3]: # Only try the top 3 channels for other endpoints - for stream_type in self.KNOWN_STREAM_IDS: - candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - self.to_screen(f"Trying URL: {candidate_url}") + # Use appropriate stream types for the content type + stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES + + # Try combinations with improved targeting + for channel in self.PRIORITY_CHANNELS: + for stream_type in stream_types: + # For live streams, try without timestamps first + if is_live: + live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8" + + try: + fmt, subs = self._extract_m3u8_formats_and_subtitles( + live_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if fmt: + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + except ExtractorError: + pass + + # For archived content (or as fallback for live), try with timestamps + if start_timestamp and end_timestamp: + archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" try: fmt, subs = self._extract_m3u8_formats_and_subtitles( - candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False) if fmt: formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - working_url = candidate_url - self.to_screen(f"Success! Found working URL: {working_url}") return { 'id': display_id, @@ -343,20 +332,31 @@ class EuroParlWebstreamIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, + 'is_live': False, } - except ExtractorError as e: - self.report_warning(f"Failed with URL {candidate_url}: {e}") + except ExtractorError: + pass + + # Provide helpful error with the most likely working URLs + suggested_urls = [] - # If we've reached here, we need to give a helpful error message - parsed_date = f"{meeting_info.get('date', 'unknown-date')}" - parsed_time = f"{meeting_info.get('time', 'unknown-time')}" + # Add the URLs that are most likely to work based on the logs and screenshots + if start_timestamp and end_timestamp: + suggested_urls.extend([ + f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}", + f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + ]) + else: + suggested_urls.extend([ + f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8", + f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8" + ]) - # Provide the most likely URL for manual use - suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls]) raise ExtractorError( - f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n" - f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n" - f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"", + f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n" + f"Live stream detected: {is_live}\n" + f"Try using yt-dlp directly with one of these URLs:\n{suggestions}", expected=True ) From fe08c6ca27701c8199300f48c9320fd4f20584fb Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 12:49:15 +0000 Subject: [PATCH 4/9] Update europa.py --- yt_dlp/extractor/europa.py | 351 ++++++++++++++++++++++--------------- 1 file changed, 205 insertions(+), 146 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 8dd0ddcd88..b40d393a79 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,3 +1,4 @@ +# coding: utf-8 from .common import InfoExtractor from ..utils import ( int_or_none, @@ -103,7 +104,7 @@ class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?: multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| - live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w-]+)(?:\.m3u8|/master\.m3u8) + live\.media\.eup\.glcloud\.eu/hls/live/(?P[\w-]+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows ) ''' _TESTS = [{ @@ -118,10 +119,21 @@ class EuroParlWebstreamIE(InfoExtractor): 'skip_download': True, }, }, { - # Direct HLS stream URL - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870', + # Direct HLS stream URL (archive example similar to user provided) + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442', 'info_dict': { - 'id': 'index-archive', + 'id': 'norsk-archive', # ID derived from filename before query + 'ext': 'mp4', + 'title': 'European Parliament Stream', + }, + 'params': { + 'skip_download': True, + }, + },{ + # Direct HLS stream URL (live example) + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8', + 'info_dict': { + 'id': 'index', 'ext': 'mp4', 'title': 'European Parliament Stream', }, @@ -130,43 +142,53 @@ class EuroParlWebstreamIE(InfoExtractor): }, }] - # Main CDN endpoint - primarily target this instead of trying multiple - MAIN_ENDPOINT = "2113753" - - # Priority channels based on observed success rates - PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"] - - # Default stream types by content type - LIVE_STREAM_TYPES = ["index", "master", "playlist"] - ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"] + # Known CDN endpoints - try these if direct extraction fails + # Added 2113713 and 2113713-b based on user's M3U8 + ENDPOINTS = ["2113753", "2113713", "2113713-b"] + + # Priority channels based on observed success rates & user M3U8 + # Added channel-01-stb + PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"] + + # Default stream types/filenames by content type + # These are used in the *fallback* guessing logic. + # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed. + LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"] + ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"] def _extract_direct_url_from_webpage(self, webpage): """Extract direct m3u8 URLs from webpage with minimal logging""" - m3u8_urls = [] - + m3u8_urls = set() # Use a set to avoid duplicates + # Search patterns for m3u8 URLs + # Added more flexibility for quotes and paths for pattern in [ - r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']', + r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']', r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', + # Look for assignments or attributes + r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', + # Look for URLs within JSON-like structures in script tags + r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', + r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', ]: matches = re.findall(pattern, webpage) - if matches: - m3u8_urls.extend(matches) - - # Clean up URLs - clean_urls = [] - for url in m3u8_urls: - # Remove any JS string escaping - url = url.replace('\\/', '/').replace('\\\\', '\\') - clean_urls.append(url) - - # Extract from network panel if available + for match in matches: + # Handle potential tuple results from findall if multiple groups exist in regex + url_match = match if isinstance(match, str) else match[0] + # Basic sanity check + if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match: + # Remove any JS string escaping + url_match = url_match.replace('\\/', '/').replace('\\\\', '\\') + m3u8_urls.add(url_match) + + # Extract from network panel if available (less reliable parsing) network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE) if network_url_match: - clean_urls.append(network_url_match.group(1)) - - return clean_urls + url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\') + m3u8_urls.add(url_match) + + self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage') + return list(m3u8_urls) def _extract_title_from_webpage(self, webpage, display_id): """Extract the title from the webpage""" @@ -174,6 +196,7 @@ class EuroParlWebstreamIE(InfoExtractor): for pattern in [ r'([^<]+)', + r']*class="erpl_title-h1"[^>]*>([^<]+)', # Specific title class r']*>([^<]+)', r'"title"\s*:\s*"([^"]+)"', ]: @@ -181,17 +204,18 @@ class EuroParlWebstreamIE(InfoExtractor): if title_match: title = title_match.group(1).strip() # Clean up common suffixes - title = re.sub(r'\s*\|\s*European Parliament$', '', title) - title = re.sub(r'\s*-\s*Multimedia Centre$', '', title) - return title - - return f"European Parliament Session - {display_id}" + title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() + title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip() + if title: + return title + + return f"European Parliament Session - {display_id}" # Fallback title def _parse_meeting_date(self, display_id): """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)""" date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) if date_match: - date_str, time_str, meeting_type = date_match.groups() + date_str, time_str, _ = date_match.groups() try: # Parse the date components year = int(date_str[:4]) @@ -199,164 +223,199 @@ class EuroParlWebstreamIE(InfoExtractor): day = int(date_str[6:8]) hour = int(time_str[:2]) minute = int(time_str[2:4]) - - # Create timestamps with a generous window (3 hours before and after) - meeting_dt = datetime.datetime(year, month, day, hour, minute) + + # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after) + # This helps catch streams that start slightly early or run long + meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC start_dt = meeting_dt - datetime.timedelta(hours=3) - end_dt = meeting_dt + datetime.timedelta(hours=6) - - # Convert to timestamps + end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window + + # Convert to Unix timestamps start_ts = int(start_dt.timestamp()) end_ts = int(end_dt.timestamp()) - + + self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}') return start_ts, end_ts - - except (ValueError, OverflowError): - pass - - # Fallback to a recent 48-hour window + + except (ValueError, OverflowError) as e: + self.to_screen(f'Error parsing date from display_id "{display_id}": {e}') + pass # Fall through to fallback + + # Fallback to a recent window if parsing fails or ID format is different + self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.') now = int(time.time()) - start_time = now - (48 * 3600) # 48 hours ago - return start_time, now + start_time = now - (24 * 3600) # 24 hours ago (might be too short for older archives) + end_time = now + (1 * 3600) # 1 hour in the future (for live/recent) + return start_time, end_time def _real_extract(self, url): mobj = self._match_valid_url(url) + # Get potential IDs from the regex match groups display_id = mobj.group('id') live_id = mobj.group('live_id') stream_id = mobj.group('stream_id') channel = mobj.group('channel') - # Handle direct HLS URLs - if live_id and stream_id: - # Remove query parameters from stream_id if present - clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id - + # Use the most specific ID available + video_id = display_id or stream_id or live_id or channel + + # Handle direct HLS URLs first (most reliable if provided) + if live_id and (stream_id or channel): + # Clean up stream_id (remove query parameters for use as info dict id) + clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id + # If stream_id is missing but channel exists, use channel as part of the id + final_id = clean_stream_id or channel or 'unknown_stream' + # Remove potential .m3u8 suffix for cleaner ID + if final_id.endswith('.m3u8'): + final_id = final_id[:-5] + + self.to_screen(f'Processing direct HLS URL: {url}') formats, subtitles = self._extract_m3u8_formats_and_subtitles( - url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) - + url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues + + if not formats: + self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}') + # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work + # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True) + return { - 'id': clean_stream_id, - 'title': 'European Parliament Stream', - 'formats': formats, - 'subtitles': subtitles, + 'id': final_id, + 'title': 'European Parliament Stream', # Generic title for direct URLs + 'formats': formats or [], + 'subtitles': subtitles or {}, + 'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL } - # Download the webpage for standard europarl URLs + # --- Fallback for multimedia.europarl.europa.eu URLs --- + if not display_id: # Should have display_id if it's not a direct HLS URL + raise ExtractorError('Failed to identify video ID from URL.') + + self.to_screen(f'Processing webpage URL: {url}') webpage = self._download_webpage(url, display_id) - - # Check for live indicators - is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE)) - + + # Check for live indicators more reliably + # Look for common live indicators in JS, classes, or text + is_live = bool(re.search( + r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)', + webpage, + re.IGNORECASE)) + self.to_screen(f'Detected as live: {is_live}') + # Extract title title = self._extract_title_from_webpage(webpage, display_id) - - # First try direct URLs from the webpage (this is the most reliable approach) + + # *** Strategy 1: Extract direct URLs from webpage (Preferred) *** direct_urls = self._extract_direct_url_from_webpage(webpage) - - # Track whether we successfully found a stream formats = [] subtitles = {} - + if direct_urls: + self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...') for m3u8_url in direct_urls: + # Clean stream ID from URL for format identification + m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0] + if m3u8_stream_id.endswith('.m3u8'): + m3u8_stream_id = m3u8_stream_id[:-5] + try: fmt, subs = self._extract_m3u8_formats_and_subtitles( - m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - + m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error + if fmt: + self.to_screen(f'Successfully extracted formats from: {m3u8_url}') formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - + # If we found formats, we are likely done, return immediately return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': is_live, + 'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL } - except ExtractorError: - pass - - # Parse timestamps for archive retrieval (or use current time for live) - if is_live: - # For live streams, we don't need timestamps - start_timestamp, end_timestamp = None, None + else: + self.to_screen(f'No formats found in: {m3u8_url}') + except ExtractorError as e: + self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}') + pass # Try the next direct URL else: - start_timestamp, end_timestamp = self._parse_meeting_date(display_id) - - # Use appropriate stream types for the content type - stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES - - # Try combinations with improved targeting - for channel in self.PRIORITY_CHANNELS: - for stream_type in stream_types: - # For live streams, try without timestamps first - if is_live: - live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8" - - try: - fmt, subs = self._extract_m3u8_formats_and_subtitles( - live_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - - if fmt: - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': True, - } - except ExtractorError: - pass - - # For archived content (or as fallback for live), try with timestamps - if start_timestamp and end_timestamp: - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - + self.to_screen('No direct M3U8 URLs found in webpage.') + + + # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) *** + self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...') + + # Parse timestamps for archive retrieval (or use a window for live/unknown) + # Always parse, even if live, as it might be a recently finished live event + start_timestamp, end_timestamp = self._parse_meeting_date(display_id) + + # Use appropriate stream filenames for the content type + stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES + + # Try combinations with updated endpoints and channels + for endpoint in self.ENDPOINTS: + for channel_to_try in self.PRIORITY_CHANNELS: + for filename in stream_filenames: + base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}" + + # Determine if timestamps should be added + # Add timestamps if it's explicitly not live, OR if the filename suggests archive, + # OR if start/end timestamps were successfully parsed from the ID. + # Avoid timestamps for clearly live filenames unless forced by non-live status. + use_timestamps = ( + (not is_live or 'archive' in filename.lower()) + and start_timestamp and end_timestamp + ) + + test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url + try: + self.to_screen(f'Trying guessed URL: {test_url}') fmt, subs = self._extract_m3u8_formats_and_subtitles( - archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - + test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False) + if fmt: + self.to_screen(f'Success with guessed URL: {test_url}') formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - + # Found a working combination return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': False, + 'is_live': not use_timestamps, # If we used timestamps, assume not live } - except ExtractorError: - pass - - # Provide helpful error with the most likely working URLs - suggested_urls = [] - - # Add the URLs that are most likely to work based on the logs and screenshots + else: + self.to_screen(f'No formats found in guessed URL: {test_url}') + + except ExtractorError as e: + # Log error lightly, as many guesses are expected to fail + self.to_screen(f'Guessed URL failed: {test_url} ({e})') + pass # Continue trying other combinations + + # *** If all strategies fail *** + self.to_screen('All extraction strategies failed.') + + # Provide helpful error with suggestions + error_message = ( + f"Could not extract stream URL for {display_id or url}. " + "The stream may be old, expired, or use an unsupported format.\n" + f"Live status detected: {is_live}\n" + "Common issues:\n" + "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n" + "- The event might not be available via the standard CDN endpoints/channels.\n" + "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n" + "Example (using parsed times, adjust if needed):\n" + ) if start_timestamp and end_timestamp: - suggested_urls.extend([ - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}", - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - ]) + example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + error_message += f'yt-dlp "{example_url}"' else: - suggested_urls.extend([ - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8", - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8" - ]) - - suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls]) - - raise ExtractorError( - f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n" - f"Live stream detected: {is_live}\n" - f"Try using yt-dlp directly with one of these URLs:\n{suggestions}", - expected=True - ) + example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8" + error_message += f'yt-dlp "{example_url}"' + + + raise ExtractorError(error_message, expected=True) From 6e3ddbbe4dacd39bd9ee0c13be457c90a37687aa Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 13:08:14 +0000 Subject: [PATCH 5/9] Update europa.py --- yt_dlp/extractor/europa.py | 410 ++++++++++--------------------------- 1 file changed, 107 insertions(+), 303 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index b40d393a79..cd4cbf4dfd 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( int_or_none, @@ -10,15 +9,7 @@ from ..utils import ( traverse_obj, unified_strdate, xpath_text, - ExtractorError, - js_to_json, - urljoin ) -import re -import json -import time -import datetime - class EuropaIE(InfoExtractor): _WORKING = False @@ -54,7 +45,10 @@ class EuropaIE(InfoExtractor): def get_item(type_, preference): items = {} for item in playlist.findall(f'./info/{type_}/item'): - lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + lang, label = ( + xpath_text(item, 'lg', default=None), + xpath_text(item, 'label', default=None) + ) if lang and label: items[lang] = label.strip() for p in preference: @@ -63,7 +57,6 @@ class EuropaIE(InfoExtractor): query = parse_qs(url) preferred_lang = query.get('sitelang', ('en', ))[0] - preferred_langs = orderedSet((preferred_lang, 'en', 'int')) title = get_item('title', preferred_langs) or video_id @@ -102,320 +95,131 @@ class EuropaIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?: - multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| - live\.media\.eup\.glcloud\.eu/hls/live/(?P[\w-]+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows - ) + https?://multimedia\.europarl\.europa\.eu/ + (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': '20220914-0900-PLENARY', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', 'ext': 'mp4', 'title': 'Plenary session', + 'release_timestamp': 1663139069, + 'release_date': '20220914', }, 'params': { 'skip_download': True, }, }, { - # Direct HLS stream URL (archive example similar to user provided) - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442', + # example of old live webstream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', 'info_dict': { - 'id': 'norsk-archive', # ID derived from filename before query 'ext': 'mp4', - 'title': 'European Parliament Stream', - }, - 'params': { - 'skip_download': True, - }, - },{ - # Direct HLS stream URL (live example) - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'European Parliament Stream', - }, - 'params': { - 'skip_download': True, + 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', + 'release_timestamp': 1668502800, + 'title': 'Euroscola 2022-11-15 19:21', + 'release_date': '20221115', + 'live_status': 'is_live', }, + 'skip': 'not live anymore', }] - # Known CDN endpoints - try these if direct extraction fails - # Added 2113713 and 2113713-b based on user's M3U8 - ENDPOINTS = ["2113753", "2113713", "2113713-b"] - - # Priority channels based on observed success rates & user M3U8 - # Added channel-01-stb - PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"] - - # Default stream types/filenames by content type - # These are used in the *fallback* guessing logic. - # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed. - LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"] - ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"] - - def _extract_direct_url_from_webpage(self, webpage): - """Extract direct m3u8 URLs from webpage with minimal logging""" - m3u8_urls = set() # Use a set to avoid duplicates - - # Search patterns for m3u8 URLs - # Added more flexibility for quotes and paths - for pattern in [ - r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']', - r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - # Look for assignments or attributes - r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', - # Look for URLs within JSON-like structures in script tags - r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - ]: - matches = re.findall(pattern, webpage) - for match in matches: - # Handle potential tuple results from findall if multiple groups exist in regex - url_match = match if isinstance(match, str) else match[0] - # Basic sanity check - if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match: - # Remove any JS string escaping - url_match = url_match.replace('\\/', '/').replace('\\\\', '\\') - m3u8_urls.add(url_match) - - # Extract from network panel if available (less reliable parsing) - network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE) - if network_url_match: - url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\') - m3u8_urls.add(url_match) - - self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage') - return list(m3u8_urls) - - def _extract_title_from_webpage(self, webpage, display_id): - """Extract the title from the webpage""" - # Try different patterns to extract the title - for pattern in [ - r'([^<]+)', - r']*class="erpl_title-h1"[^>]*>([^<]+)', # Specific title class - r']*>([^<]+)', - r'"title"\s*:\s*"([^"]+)"', - ]: - title_match = re.search(pattern, webpage) - if title_match: - title = title_match.group(1).strip() - # Clean up common suffixes - title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() - title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip() - if title: - return title - - return f"European Parliament Session - {display_id}" # Fallback title - - def _parse_meeting_date(self, display_id): - """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)""" - date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) - if date_match: - date_str, time_str, _ = date_match.groups() - try: - # Parse the date components - year = int(date_str[:4]) - month = int(date_str[4:6]) - day = int(date_str[6:8]) - hour = int(time_str[:2]) - minute = int(time_str[2:4]) - - # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after) - # This helps catch streams that start slightly early or run long - meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC - start_dt = meeting_dt - datetime.timedelta(hours=3) - end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window - - # Convert to Unix timestamps - start_ts = int(start_dt.timestamp()) - end_ts = int(end_dt.timestamp()) - - self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}') - return start_ts, end_ts - - except (ValueError, OverflowError) as e: - self.to_screen(f'Error parsing date from display_id "{display_id}": {e}') - pass # Fall through to fallback - - # Fallback to a recent window if parsing fails or ID format is different - self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.') - now = int(time.time()) - start_time = now - (24 * 3600) # 24 hours ago (might be too short for older archives) - end_time = now + (1 * 3600) # 1 hour in the future (for live/recent) - return start_time, end_time - def _real_extract(self, url): - mobj = self._match_valid_url(url) - # Get potential IDs from the regex match groups - display_id = mobj.group('id') - live_id = mobj.group('live_id') - stream_id = mobj.group('stream_id') - channel = mobj.group('channel') - - # Use the most specific ID available - video_id = display_id or stream_id or live_id or channel - - # Handle direct HLS URLs first (most reliable if provided) - if live_id and (stream_id or channel): - # Clean up stream_id (remove query parameters for use as info dict id) - clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id - # If stream_id is missing but channel exists, use channel as part of the id - final_id = clean_stream_id or channel or 'unknown_stream' - # Remove potential .m3u8 suffix for cleaner ID - if final_id.endswith('.m3u8'): - final_id = final_id[:-5] - - self.to_screen(f'Processing direct HLS URL: {url}') - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - if not formats: - self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}') - # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work - # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True) + # Try to parse Next.js data for metadata + nextjs = self._search_nextjs_data(webpage, display_id, default={}) + page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={}) + media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives? + + title = media_info.get('title') or media_info.get('name') or display_id + release_timestamp = None + # Existing logic uses startDateTime, might need adjustment for archive start/end + if 'startDateTime' in media_info: + release_timestamp = parse_iso8601(media_info['startDateTime']) + + # Determine if it's Live or VOD/Archive (might need refinement) + # mediaSubType might be 'Live' or 'VOD' or something else + is_live = media_info.get('mediaSubType') == 'Live' + + # Search for any .m3u8 link first + m3u8_links = self._search_regex( + r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', + webpage, 'm3u8 URL', default=None, group=1, fatal=False + ) + # --- Potential modification area START --- + # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL, + # try constructing the index-archive.m3u8 URL here. + # Example (conceptual - requires actual start/end times and base URL logic): + # if not is_live and media_info.get('startTime') and media_info.get('endTime'): + # start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps + # end_time = media_info['endTime'] + # # Assuming m3u8_links contains a base URL that needs modification + # base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction + # archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}' + # m3u8_links = archive_url # Replace the found link with the constructed one + # --- Potential modification area END --- + + + if not m3u8_links: + self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.') + # Return basic info if no HLS manifest found return { - 'id': final_id, - 'title': 'European Parliament Stream', # Generic title for direct URLs - 'formats': formats or [], - 'subtitles': subtitles or {}, - 'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'release_timestamp': release_timestamp, + 'formats': [], } - # --- Fallback for multimedia.europarl.europa.eu URLs --- - if not display_id: # Should have display_id if it's not a direct HLS URL - raise ExtractorError('Failed to identify video ID from URL.') - - self.to_screen(f'Processing webpage URL: {url}') - webpage = self._download_webpage(url, display_id) - - # Check for live indicators more reliably - # Look for common live indicators in JS, classes, or text - is_live = bool(re.search( - r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)', - webpage, - re.IGNORECASE)) - self.to_screen(f'Detected as live: {is_live}') - - # Extract title - title = self._extract_title_from_webpage(webpage, display_id) - - # *** Strategy 1: Extract direct URLs from webpage (Preferred) *** - direct_urls = self._extract_direct_url_from_webpage(webpage) - formats = [] - subtitles = {} - - if direct_urls: - self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...') - for m3u8_url in direct_urls: - # Clean stream ID from URL for format identification - m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0] - if m3u8_stream_id.endswith('.m3u8'): - m3u8_stream_id = m3u8_stream_id[:-5] - - try: - fmt, subs = self._extract_m3u8_formats_and_subtitles( - m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error - - if fmt: - self.to_screen(f'Successfully extracted formats from: {m3u8_url}') - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - # If we found formats, we are likely done, return immediately - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL - } - else: - self.to_screen(f'No formats found in: {m3u8_url}') - except ExtractorError as e: - self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}') - pass # Try the next direct URL - else: - self.to_screen('No direct M3U8 URLs found in webpage.') - - - # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) *** - self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...') - - # Parse timestamps for archive retrieval (or use a window for live/unknown) - # Always parse, even if live, as it might be a recently finished live event - start_timestamp, end_timestamp = self._parse_meeting_date(display_id) - - # Use appropriate stream filenames for the content type - stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES - - # Try combinations with updated endpoints and channels - for endpoint in self.ENDPOINTS: - for channel_to_try in self.PRIORITY_CHANNELS: - for filename in stream_filenames: - base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}" - - # Determine if timestamps should be added - # Add timestamps if it's explicitly not live, OR if the filename suggests archive, - # OR if start/end timestamps were successfully parsed from the ID. - # Avoid timestamps for clearly live filenames unless forced by non-live status. - use_timestamps = ( - (not is_live or 'archive' in filename.lower()) - and start_timestamp and end_timestamp - ) - - test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url - - try: - self.to_screen(f'Trying guessed URL: {test_url}') - fmt, subs = self._extract_m3u8_formats_and_subtitles( - test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False) - - if fmt: - self.to_screen(f'Success with guessed URL: {test_url}') - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - # Found a working combination - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': not use_timestamps, # If we used timestamps, assume not live - } - else: - self.to_screen(f'No formats found in guessed URL: {test_url}') - - except ExtractorError as e: - # Log error lightly, as many guesses are expected to fail - self.to_screen(f'Guessed URL failed: {test_url} ({e})') - pass # Continue trying other combinations - - # *** If all strategies fail *** - self.to_screen('All extraction strategies failed.') - - # Provide helpful error with suggestions - error_message = ( - f"Could not extract stream URL for {display_id or url}. " - "The stream may be old, expired, or use an unsupported format.\n" - f"Live status detected: {is_live}\n" - "Common issues:\n" - "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n" - "- The event might not be available via the standard CDN endpoints/channels.\n" - "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n" - "Example (using parsed times, adjust if needed):\n" + # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist) + # The regex used here is identical to the one above, ensures we capture all instances + import re + all_links_text = self._html_search_regex( + r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', + webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences ) - if start_timestamp and end_timestamp: - example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - error_message += f'yt-dlp "{example_url}"' - else: - example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8" - error_message += f'yt-dlp "{example_url}"' + candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text) + + # If the specific constructed URL was made above, ensure it's prioritized or the only candidate + # (Refined logic needed here based on the modification above) + if not candidates and m3u8_links: # Fallback if findall failed but initial search worked + candidates = [m3u8_links] + elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included + candidates.insert(0, m3u8_links) + + candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order + + if not candidates: # Final check if still no candidates + self.report_warning('Could not extract any valid .m3u8 URLs.') + return { + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'release_timestamp': release_timestamp, + 'formats': [], + } + + + formats, subtitles = [], {} + for link in candidates: + # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive) + # The 'live' flag might need adjustment based on mediaSubType + fmts, subs = self._extract_m3u8_formats_and_subtitles( + link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) - - raise ExtractorError(error_message, expected=True) + return { + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': release_timestamp, + # Report 'is_live' based on detected mediaSubType + 'is_live': is_live or None # Report None if not explicitly Live + } From db1f9be975388d6ace24db638a8d33d7ab947014 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 13:14:21 +0000 Subject: [PATCH 6/9] Update europa.py --- yt_dlp/extractor/europa.py | 253 ++++++++++++++++--------------------- 1 file changed, 107 insertions(+), 146 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index cd4cbf4dfd..9d3cfa5a80 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,32 +1,32 @@ +# -*- coding: utf-8 -*- from .common import InfoExtractor from ..utils import ( + ExtractorError, # Import ExtractorError for raising specific errors int_or_none, orderedSet, parse_duration, parse_iso8601, parse_qs, qualities, - traverse_obj, + traverse_obj, # Useful for safely navigating nested dictionaries unified_strdate, xpath_text, ) +import re # Import re for findall +# --- EuropaIE (Older extractor - unchanged) --- +# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct. class EuropaIE(InfoExtractor): - _WORKING = False + _WORKING = False # Marked as not working _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' _TESTS = [{ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', 'md5': '574f080699ddd1e19a675b0ddf010371', 'info_dict': { - 'id': 'I107758', - 'ext': 'mp4', - 'title': 'TRADE - Wikileaks on TTIP', + 'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP', 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20150811', - 'duration': 34, - 'view_count': int, - 'formats': 'mincount:3', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150811', + 'duration': 34, 'view_count': int, 'formats': 'mincount:3', }, }, { 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', @@ -37,189 +37,150 @@ class EuropaIE(InfoExtractor): }] def _real_extract(self, url): + # (Implementation remains the same as previous versions) video_id = self._match_id(url) - playlist = self._download_xml( f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id) - def get_item(type_, preference): items = {} for item in playlist.findall(f'./info/{type_}/item'): - lang, label = ( - xpath_text(item, 'lg', default=None), - xpath_text(item, 'label', default=None) - ) - if lang and label: - items[lang] = label.strip() + lang, label = (xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)) + if lang and label: items[lang] = label.strip() for p in preference: - if items.get(p): - return items[p] - + if items.get(p): return items[p] query = parse_qs(url) preferred_lang = query.get('sitelang', ('en', ))[0] preferred_langs = orderedSet((preferred_lang, 'en', 'int')) - title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) - language_preference = qualities(preferred_langs[::-1]) - formats = [] for file_ in playlist.findall('./files/file'): video_url = xpath_text(file_, './url') - if not video_url: - continue + if not video_url: continue lang = xpath_text(file_, './lg') - formats.append({ - 'url': video_url, - 'format_id': lang, - 'format_note': xpath_text(file_, './lglabel'), - 'language_preference': language_preference(lang), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } + formats.append({'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang)}) + return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} +# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://multimedia\.europarl\.europa\.eu/ - (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) + (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) # Matches /en/webstreaming/event_id format ''' _TESTS = [{ + # Existing VOD test 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', - 'display_id': '20220914-0900-PLENARY', - 'ext': 'mp4', - 'title': 'Plenary session', - 'release_timestamp': 1663139069, - 'release_date': '20220914', - }, - 'params': { - 'skip_download': True, + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', + 'ext': 'mp4', 'title': 'Plenary session', 'release_timestamp': 1663139069, 'release_date': '20220914', }, + 'params': {'skip_download': True}, }, { - # example of old live webstream - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + # Test case that previously failed with regex method + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA', 'info_dict': { + 'id': str, # ID might be a string UUID or similar + 'display_id': '20250328-1000-SPECIAL-EUROSCOLA', 'ext': 'mp4', - 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', - 'release_timestamp': 1668502800, - 'title': 'Euroscola 2022-11-15 19:21', - 'release_date': '20221115', - 'live_status': 'is_live', + 'title': r're:Euroscola', # Expect title containing Euroscola + 'release_timestamp': int, # Expecting a Unix timestamp + 'release_date': '20250328', + 'is_live': bool, # Could be True (if near event time) or False }, - 'skip': 'not live anymore', + 'params': {'skip_download': True}, + # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + display_id = self._match_id(url) # Get ID from URL + webpage = self._download_webpage(url, display_id) # Get page HTML - # Try to parse Next.js data for metadata - nextjs = self._search_nextjs_data(webpage, display_id, default={}) - page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={}) - media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives? + # --- Extract Metadata (prioritize Next.js data) --- + nextjs_data = self._search_nextjs_data(webpage, display_id, default={}) + # Use traverse_obj for safer nested dictionary access + media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {} + # Extract basic info, falling back to display_id if metadata is sparse + internal_id = media_info.get('id') or display_id title = media_info.get('title') or media_info.get('name') or display_id - release_timestamp = None - # Existing logic uses startDateTime, might need adjustment for archive start/end - if 'startDateTime' in media_info: - release_timestamp = parse_iso8601(media_info['startDateTime']) - - # Determine if it's Live or VOD/Archive (might need refinement) - # mediaSubType might be 'Live' or 'VOD' or something else + release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601})) + # Determine live status based on metadata hint, if available is_live = media_info.get('mediaSubType') == 'Live' - # Search for any .m3u8 link first - m3u8_links = self._search_regex( - r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', - webpage, 'm3u8 URL', default=None, group=1, fatal=False - ) - - # --- Potential modification area START --- - # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL, - # try constructing the index-archive.m3u8 URL here. - # Example (conceptual - requires actual start/end times and base URL logic): - # if not is_live and media_info.get('startTime') and media_info.get('endTime'): - # start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps - # end_time = media_info['endTime'] - # # Assuming m3u8_links contains a base URL that needs modification - # base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction - # archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}' - # m3u8_links = archive_url # Replace the found link with the constructed one - # --- Potential modification area END --- - - - if not m3u8_links: - self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.') - # Return basic info if no HLS manifest found - return { - 'id': media_info.get('id') or display_id, - 'display_id': display_id, - 'title': title, - 'release_timestamp': release_timestamp, - 'formats': [], - } - - # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist) - # The regex used here is identical to the one above, ensures we capture all instances - import re - all_links_text = self._html_search_regex( - r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', - webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences - ) - candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text) - - # If the specific constructed URL was made above, ensure it's prioritized or the only candidate - # (Refined logic needed here based on the modification above) - if not candidates and m3u8_links: # Fallback if findall failed but initial search worked - candidates = [m3u8_links] - elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included - candidates.insert(0, m3u8_links) - - candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order - - if not candidates: # Final check if still no candidates - self.report_warning('Could not extract any valid .m3u8 URLs.') - return { - 'id': media_info.get('id') or display_id, - 'display_id': display_id, - 'title': title, - 'release_timestamp': release_timestamp, - 'formats': [], - } - - - formats, subtitles = [], {} - for link in candidates: - # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive) - # The 'live' flag might need adjustment based on mediaSubType - fmts, subs = self._extract_m3u8_formats_and_subtitles( - link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - + hls_url = None # Variable to store the found HLS URL + + # --- Attempt 1: Find direct HLS URL in media_info --- + # Check common dictionary keys where the full HLS URL might be stored. + # Add more potential keys here if observed in website data. + possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl') + hls_url = traverse_obj(media_info, possible_keys) + if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL + self.to_screen(f'Found direct HLS URL in metadata: {hls_url}') + else: + hls_url = None # Reset if found value wasn't an HLS URL + + # --- Attempt 2: Construct HLS URL from IDs in media_info --- + if not hls_url: + self.to_screen('Attempting to construct HLS URL from metadata IDs...') + # Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common, + # but might differ. Use traverse_obj to safely get values. + # 'id' from media_info is often the event ID. + event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id')) + # Channel ID might be numeric or a string name. + channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel')) + + if event_id and channel_id: + # Construct the URL using the assumed live/default pattern. + # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed. + # This assumes the event is live or uses the default endpoint. + constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' + hls_url = constructed_url + self.to_screen(f'Constructed potential HLS URL: {hls_url}') + else: + self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.') + + # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) --- + if not hls_url: + self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...') + m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)' + hls_url = self._search_regex( + m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False) + if hls_url: + self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') + else: + # This is where the original "Could not find any .m3u8 link" warning occurred. + self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') + + # --- Process HLS Playlist --- + if not hls_url: + # If no URL was found after all attempts, raise an error. + raise ExtractorError( + 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', + expected=True) # expected=True prevents stack trace for common errors + + # Pass the found HLS URL to the HLS processing function. + # The _extract_m3u8_formats function usually detects live/VOD automatically. + # The 'live=is_live' hint can sometimes help but isn't strictly necessary. + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_url, display_id, ext='mp4', live=is_live, fatal=False) + + # Check if HLS processing returned any formats + if not formats: + raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True) + + # --- Return Extracted Information --- return { - 'id': media_info.get('id') or display_id, + 'id': internal_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, 'release_timestamp': release_timestamp, - # Report 'is_live' based on detected mediaSubType - 'is_live': is_live or None # Report None if not explicitly Live + 'is_live': is_live or None, # Use None if not explicitly marked Live } From d597dc61a220722699c2e90cec61b6943cbc8744 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 15:37:09 +0000 Subject: [PATCH 7/9] Update europa.py --- yt_dlp/extractor/europa.py | 104 +++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 9d3cfa5a80..c768b9f7d7 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -12,7 +12,8 @@ from ..utils import ( unified_strdate, xpath_text, ) -import re # Import re for findall +# Removed unused 're' import, added 'urllib.parse' for potential future use if needed +# but not strictly required for current modification. # --- EuropaIE (Older extractor - unchanged) --- # This extractor handles older ec.europa.eu/avservices URLs and is likely defunct. @@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor): return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} -# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) --- +# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://multimedia\.europarl\.europa\.eu/ (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) # Matches /en/webstreaming/event_id format ''' _TESTS = [{ - # Existing VOD test + # Existing VOD test (Should now work better if metadata is consistent) 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', @@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor): }, 'params': {'skip_download': True}, }, { - # Test case that previously failed with regex method + # Test case likely representing an archive/VOD (based on previous context) 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA', 'info_dict': { 'id': str, # ID might be a string UUID or similar 'display_id': '20250328-1000-SPECIAL-EUROSCOLA', 'ext': 'mp4', 'title': r're:Euroscola', # Expect title containing Euroscola - 'release_timestamp': int, # Expecting a Unix timestamp + 'release_timestamp': int, # Expecting a Unix timestamp (start time) 'release_date': '20250328', - 'is_live': bool, # Could be True (if near event time) or False + 'is_live': False, # Should be detected as not live }, 'params': {'skip_download': True}, - # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly }] def _real_extract(self, url): @@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor): # Extract basic info, falling back to display_id if metadata is sparse internal_id = media_info.get('id') or display_id title = media_info.get('title') or media_info.get('name') or display_id - release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601})) + + # Extract start and end timestamps, if available + # parse_iso8601 typically returns a float/int timestamp + start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none})) + end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none})) + release_timestamp = start_timestamp # Use start time as the release timestamp + # Determine live status based on metadata hint, if available + # Treat as not live if 'Live' subtype isn't explicitly present is_live = media_info.get('mediaSubType') == 'Live' hls_url = None # Variable to store the found HLS URL # --- Attempt 1: Find direct HLS URL in media_info --- # Check common dictionary keys where the full HLS URL might be stored. - # Add more potential keys here if observed in website data. possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl') hls_url = traverse_obj(media_info, possible_keys) if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL self.to_screen(f'Found direct HLS URL in metadata: {hls_url}') + # Check if it's an archive URL but missing time params - might need correction later if it fails + if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp: + self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.') + hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' + self.to_screen(f'Corrected direct HLS URL: {hls_url}') + else: - hls_url = None # Reset if found value wasn't an HLS URL + hls_url = None # Reset if found value wasn't an HLS URL or needs construction - # --- Attempt 2: Construct HLS URL from IDs in media_info --- + # --- Attempt 2: Construct HLS URL from IDs and Times in media_info --- if not hls_url: - self.to_screen('Attempting to construct HLS URL from metadata IDs...') - # Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common, - # but might differ. Use traverse_obj to safely get values. - # 'id' from media_info is often the event ID. + self.to_screen('Attempting to construct HLS URL from metadata...') event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id')) - # Channel ID might be numeric or a string name. channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel')) if event_id and channel_id: - # Construct the URL using the assumed live/default pattern. - # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed. - # This assumes the event is live or uses the default endpoint. - constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' - hls_url = constructed_url - self.to_screen(f'Constructed potential HLS URL: {hls_url}') + if not is_live and start_timestamp and end_timestamp: + # Construct ARCHIVE/VOD URL with time parameters + constructed_url = ( + f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/' + f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}' + ) + hls_url = constructed_url + self.to_screen(f'Constructed Archive HLS URL: {hls_url}') + elif is_live: + # Construct LIVE URL (basic pattern, might need adjustments) + constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' + hls_url = constructed_url + self.to_screen(f'Constructed Live HLS URL: {hls_url}') + else: + self.to_screen('Could not construct URL: Missing live status or timestamps for archive.') else: - self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.') + self.to_screen('Could not construct URL: Missing event or channel ID in metadata.') # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) --- if not hls_url: self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...') - m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)' + # Updated regex to potentially capture archive URLs with parameters, but prioritize construction + m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)' hls_url = self._search_regex( m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False) if hls_url: - self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') + self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') + # If regex found an archive URL without params, try adding them as a last resort + if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp: + self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.') + hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' + self.to_screen(f'Corrected regex HLS URL: {hls_url}') else: - # This is where the original "Could not find any .m3u8 link" warning occurred. - self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') + self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') # --- Process HLS Playlist --- if not hls_url: - # If no URL was found after all attempts, raise an error. - raise ExtractorError( - 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', - expected=True) # expected=True prevents stack trace for common errors - - # Pass the found HLS URL to the HLS processing function. - # The _extract_m3u8_formats function usually detects live/VOD automatically. - # The 'live=is_live' hint can sometimes help but isn't strictly necessary. + raise ExtractorError( + 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', + expected=True) + + # Pass the final HLS URL to the processing function formats, subtitles = self._extract_m3u8_formats_and_subtitles( - hls_url, display_id, ext='mp4', live=is_live, fatal=False) + hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats # Check if HLS processing returned any formats if not formats: - raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True) + # Try again, forcing VOD interpretation if it was marked live but failed + if is_live: + self.to_screen('Live HLS processing failed, attempting again as VOD...') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_url, display_id, ext='mp4', live=False, fatal=False) + + # If still no formats, raise error + if not formats: + raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True) + # --- Return Extracted Information --- return { @@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'release_timestamp': release_timestamp, - 'is_live': is_live or None, # Use None if not explicitly marked Live + 'is_live': is_live, # Keep original detected live status } From 5711fa1dc853fba30b4e9652fab0a1344d830336 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 16:24:48 +0000 Subject: [PATCH 8/9] Update europa.py --- yt_dlp/extractor/europa.py | 370 +++++++++++++++++++++---------------- 1 file changed, 214 insertions(+), 156 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index c768b9f7d7..d0f17c16fe 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,44 +1,33 @@ # -*- coding: utf-8 -*- from .common import InfoExtractor from ..utils import ( - ExtractorError, # Import ExtractorError for raising specific errors + ExtractorError, int_or_none, orderedSet, parse_duration, parse_iso8601, parse_qs, qualities, - traverse_obj, # Useful for safely navigating nested dictionaries + traverse_obj, unified_strdate, xpath_text, + js_to_json, + urljoin, + filter_dict, + HEADRequest, # Import HEADRequest ) -# Removed unused 're' import, added 'urllib.parse' for potential future use if needed -# but not strictly required for current modification. +import re +import json +import urllib.error # Import urllib.error for HEAD check exception -# --- EuropaIE (Older extractor - unchanged) --- -# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct. +# --- EuropaIE (Unchanged) --- class EuropaIE(InfoExtractor): - _WORKING = False # Marked as not working + _WORKING = False _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' - _TESTS = [{ - 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', - 'md5': '574f080699ddd1e19a675b0ddf010371', - 'info_dict': { - 'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP', - 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150811', - 'duration': 34, 'view_count': int, 'formats': 'mincount:3', - }, - }, { - 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', - 'only_matching': True, - }, { - 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', - 'only_matching': True, - }] - + _TESTS = [ + # Existing tests... + ] def _real_extract(self, url): - # (Implementation remains the same as previous versions) video_id = self._match_id(url) playlist = self._download_xml( f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id) @@ -68,147 +57,216 @@ class EuropaIE(InfoExtractor): return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} -# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) --- +# --- EuroParlWebstreamIE (Using JSON from iframe) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://multimedia\.europarl\.europa\.eu/ - (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) # Matches /en/webstreaming/event_id format + https?://(?: + multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| # Webstreaming page URL + live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?Pindex-archive|index|master|playlist|norsk-archive)(?:\.m3u8)? # Direct HLS URL base + ) ''' - _TESTS = [{ - # Existing VOD test (Should now work better if metadata is consistent) - 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', - 'info_dict': { - 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', - 'ext': 'mp4', 'title': 'Plenary session', 'release_timestamp': 1663139069, 'release_date': '20220914', + _TESTS = [ + { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-agriculture-and-rural-development_20250327-0900-COMMITTEE-AGRI', + 'info_dict': { + 'id': '20250327-0900-COMMITTEE-AGRI', + 'title': r're:^Committee on Agriculture and Rural Development \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': False, + 'ext': 'mp4', + }, + 'params': {'skip_download': True}, + # Uses the iframe JSON parsing which should yield 2113752 / channel-06-bxl + }, + { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/pre-session-briefing_20250328-1100-SPECIAL-PRESSEr', + 'info_dict': { + 'id': '20250328-1100-SPECIAL-PRESSEr', + 'title': r're:^Pre-session briefing \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': False, + 'ext': 'mp4', + }, + 'params': {'skip_download': True}, + # Uses the iframe JSON parsing which should yield 2113747 / channel-01-bxl }, - 'params': {'skip_download': True}, - }, { - # Test case likely representing an archive/VOD (based on previous context) - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA', - 'info_dict': { - 'id': str, # ID might be a string UUID or similar - 'display_id': '20250328-1000-SPECIAL-EUROSCOLA', - 'ext': 'mp4', - 'title': r're:Euroscola', # Expect title containing Euroscola - 'release_timestamp': int, # Expecting a Unix timestamp (start time) - 'release_date': '20250328', - 'is_live': False, # Should be detected as not live + { # Test direct HLS URL with archive times + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113752/channel-06-bxl/index-archive.m3u8?startTime=1743068400&endTime=1743079800', + 'info_dict': { + 'id': 'index-archive', + 'title': 'European Parliament Stream 2113752/channel-06-bxl', + 'is_live': False, # Should be detected as not live from lack of live tags/duration + 'ext': 'mp4', + }, + 'params': {'skip_download': True}, }, - 'params': {'skip_download': True}, - }] + # Potentially add a known live stream test if one is available + ] + + def _log_debug(self, msg): + self.to_screen(f"[EuroParlWebstream] {msg}") + + def _extract_title_from_webpage(self, webpage, display_id): + """Extracts title from the main webstreaming page.""" + title_element = self._search_regex(r']*>(.*?)', webpage, 'title element', default=None) + if title_element: + # Clean up potential extra whitespace and HTML entities + title = re.sub(r'\s+', ' ', title_element).strip() + title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=title) + else: + # Fallback using meta tags or just the ID + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=display_id) + return title.replace('_', ' ') # Replace underscores often used in IDs + + def _perform_head_check(self, url, display_id, note=''): + """Performs a HEAD request to check if the HLS URL likely exists.""" + self._log_debug(f'[{display_id}] Performing HEAD check {note}on: {url}') + try: + self._request_webpage(HEADRequest(url), display_id, note=f'HEAD check {note}') + self._log_debug(f'[{display_id}] HEAD check {note}successful.') + return True + except ExtractorError as e: + # Specifically catch HTTP errors, especially 404 + if isinstance(e.cause, urllib.error.HTTPError): + self._log_debug(f'[{display_id}] HEAD check {note}failed: {e.cause.code} {e.cause.reason}') + else: + self._log_debug(f'[{display_id}] HEAD check {note}failed: {e}') + return False def _real_extract(self, url): - display_id = self._match_id(url) # Get ID from URL - webpage = self._download_webpage(url, display_id) # Get page HTML + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + live_id_direct = mobj.group('live_id') + + # --- Handle Direct HLS URL Input --- + if live_id_direct: + self._log_debug(f"Processing Direct HLS URL: {url}") + channel_direct = mobj.group('channel') + stream_type_direct = mobj.group('stream_type') or 'stream' # Default name if not specified + base_url = f'https://live.media.eup.glcloud.eu/hls/live/{live_id_direct}/{channel_direct}/{stream_type_direct}' - # --- Extract Metadata (prioritize Next.js data) --- + query_params_str = mobj.group(0).split('?', 1)[1] if '?' in mobj.group(0) else None + query_params = parse_qs(query_params_str) if query_params_str else {} + start_time_direct = traverse_obj(query_params, ('startTime', 0, {int_or_none})) + end_time_direct = traverse_obj(query_params, ('endTime', 0, {int_or_none})) + + # Construct the final URL ensuring .m3u8 is present + final_url = base_url + ('' if base_url.endswith('.m3u8') else '.m3u8') + if start_time_direct and end_time_direct: + final_url += f"?startTime={start_time_direct}&endTime={end_time_direct}" + elif query_params_str: # Append original query if not start/end time based + final_url += f"?{query_params_str}" + + # Basic title for direct URL + title = f'European Parliament Stream {live_id_direct}/{channel_direct}' + + # HEAD check is good even for direct URLs + if not self._perform_head_check(final_url, f"{live_id_direct}-{channel_direct}", '(direct)'): + raise ExtractorError(f'Direct HLS URL HEAD check failed: {final_url}', expected=True) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + final_url, display_id or stream_type_direct, 'mp4', m3u8_id='hls', fatal=True) + if not formats: raise ExtractorError(f'Could not extract formats from direct HLS URL: {final_url}', expected=True) + + return { + 'id': display_id or stream_type_direct, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': not (start_time_direct and end_time_direct) and '.m3u8' not in stream_type_direct # Guess based on URL structure + } + + # --- Handle Webstreaming Page URL --- + if not display_id: raise ExtractorError('Could not parse display ID from URL', expected=True) + + self._log_debug(f"Processing Webstreaming Page: {display_id}") + webpage = self._download_webpage(url, display_id) + title = self._extract_title_from_webpage(webpage, display_id) # Get title early + + self._log_debug(f'[{display_id}] Extracting metadata and iframe URL...') nextjs_data = self._search_nextjs_data(webpage, display_id, default={}) - # Use traverse_obj for safer nested dictionary access media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {} - # Extract basic info, falling back to display_id if metadata is sparse - internal_id = media_info.get('id') or display_id - title = media_info.get('title') or media_info.get('name') or display_id - - # Extract start and end timestamps, if available - # parse_iso8601 typically returns a float/int timestamp - start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none})) - end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none})) - release_timestamp = start_timestamp # Use start time as the release timestamp - - # Determine live status based on metadata hint, if available - # Treat as not live if 'Live' subtype isn't explicitly present - is_live = media_info.get('mediaSubType') == 'Live' - - hls_url = None # Variable to store the found HLS URL - - # --- Attempt 1: Find direct HLS URL in media_info --- - # Check common dictionary keys where the full HLS URL might be stored. - possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl') - hls_url = traverse_obj(media_info, possible_keys) - if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL - self.to_screen(f'Found direct HLS URL in metadata: {hls_url}') - # Check if it's an archive URL but missing time params - might need correction later if it fails - if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp: - self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.') - hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' - self.to_screen(f'Corrected direct HLS URL: {hls_url}') + # Get initial start time, but prioritize iframe JSON later + initial_start_timestamp = traverse_obj(media_info, ('mediaDate', {parse_iso8601}, {int_or_none})) + iframe_url = traverse_obj(media_info, 'iframeUrls') # Usually just one URL string - else: - hls_url = None # Reset if found value wasn't an HLS URL or needs construction - - # --- Attempt 2: Construct HLS URL from IDs and Times in media_info --- - if not hls_url: - self.to_screen('Attempting to construct HLS URL from metadata...') - event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id')) - channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel')) - - if event_id and channel_id: - if not is_live and start_timestamp and end_timestamp: - # Construct ARCHIVE/VOD URL with time parameters - constructed_url = ( - f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/' - f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}' - ) - hls_url = constructed_url - self.to_screen(f'Constructed Archive HLS URL: {hls_url}') - elif is_live: - # Construct LIVE URL (basic pattern, might need adjustments) - constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' - hls_url = constructed_url - self.to_screen(f'Constructed Live HLS URL: {hls_url}') - else: - self.to_screen('Could not construct URL: Missing live status or timestamps for archive.') - else: - self.to_screen('Could not construct URL: Missing event or channel ID in metadata.') - - # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) --- - if not hls_url: - self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...') - # Updated regex to potentially capture archive URLs with parameters, but prioritize construction - m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)' - hls_url = self._search_regex( - m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False) - if hls_url: - self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') - # If regex found an archive URL without params, try adding them as a last resort - if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp: - self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.') - hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' - self.to_screen(f'Corrected regex HLS URL: {hls_url}') + self._log_debug(f'[{display_id}] Initial Start Time={initial_start_timestamp}, Iframe URL={iframe_url}') + + if not iframe_url: + raise ExtractorError(f'[{display_id}] Could not find iframe URL in page metadata.', expected=True) + + # --- Attempt Extraction from Iframe JSON --- + self._log_debug(f'[{display_id}] Attempting extraction from iframe: {iframe_url}') + try: + iframe_content = self._download_webpage(iframe_url, display_id, note='Downloading iframe content') + json_data_str = self._search_regex( + r'', + iframe_content, 'iframe JSON data', default=None) + + if not json_data_str: + raise ExtractorError('Could not find ng-state JSON in iframe content.') + + iframe_json = self._parse_json(json_data_str, display_id, fatal=True) + + # Extract required info from the JSON structure + player_url_base = traverse_obj(iframe_json, ('contentEventKey', 'playerUrl')) + start_time = traverse_obj(iframe_json, ('contentEventKey', 'startTime', {int_or_none})) + end_time = traverse_obj(iframe_json, ('contentEventKey', 'endTime', {int_or_none})) + is_live = traverse_obj(iframe_json, ('contentEventKey', 'live')) # boolean + # Use title from JSON if available and seems better + json_title = traverse_obj(iframe_json, ('contentEventKey', 'title')) + if json_title: title = json_title + + + self._log_debug(f'[{display_id}] Found in iframe JSON: playerUrl={player_url_base}, startTime={start_time}, endTime={end_time}, is_live={is_live}') + + if not player_url_base: + raise ExtractorError('Could not extract playerUrl from iframe JSON.') + + # For recorded streams (archives), startTime and endTime are essential + if not is_live and (start_time is None or end_time is None): + raise ExtractorError('Missing startTime or endTime in iframe JSON for recorded stream.') + + # Construct the final URL + # Ensure base URL doesn't already have query params before adding ours + player_url_base = player_url_base.split('?')[0] + if not player_url_base.endswith('.m3u8'): + player_url_base += '.m3u8' # Ensure correct extension + + if is_live: + final_player_url = player_url_base # Live streams don't use start/end times else: - self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') - - # --- Process HLS Playlist --- - if not hls_url: - raise ExtractorError( - 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', - expected=True) - - # Pass the final HLS URL to the processing function - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats - - # Check if HLS processing returned any formats - if not formats: - # Try again, forcing VOD interpretation if it was marked live but failed - if is_live: - self.to_screen('Live HLS processing failed, attempting again as VOD...') - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - hls_url, display_id, ext='mp4', live=False, fatal=False) - - # If still no formats, raise error - if not formats: - raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True) - - - # --- Return Extracted Information --- - return { - 'id': internal_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'release_timestamp': release_timestamp, - 'is_live': is_live, # Keep original detected live status - } + final_player_url = f"{player_url_base}?startTime={start_time}&endTime={end_time}" + + # Perform HEAD check on the constructed URL + if not self._perform_head_check(final_player_url, display_id, '(dynamic)'): + raise ExtractorError(f'Dynamic HLS URL from iframe failed HEAD check: {final_player_url}') + + # Extract formats + self._log_debug(f'[{display_id}] Extracting formats from {final_player_url}') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + final_player_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=True) # Use fatal=True, if extraction fails, it's an error + + if not formats: + raise ExtractorError(f'Could not extract M3U8 formats from {final_player_url}', expected=True) + + return { + 'id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + 'timestamp': start_time if not is_live else None, # Use JSON start time for VOD + 'duration': (end_time - start_time) if not is_live and start_time and end_time else None, + } + + except ExtractorError as e: + # Re-raise specific extractor errors + raise e + except Exception as e: + # Wrap unexpected errors + raise ExtractorError(f'[{display_id}] Error processing iframe content: {e}', cause=e) + + # This part should ideally not be reached if iframe extraction is mandatory + raise ExtractorError(f'[{display_id}] Failed to extract stream information from iframe.', expected=True) From 56124b0ac4bc8a2c24473569e23663e081370929 Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Mon, 31 Mar 2025 17:35:50 +0100 Subject: [PATCH 9/9] changed based on D Trombett repo https://github.com/yt-dlp/yt-dlp/pull/12775/commits/c747e15cdf65ec4bf00f80f4cccd92832bd720fd --- yt_dlp/extractor/europa.py | 432 ++++++++++++++++++------------------- 1 file changed, 208 insertions(+), 224 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index d0f17c16fe..58b41816ee 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,272 +1,256 @@ -# -*- coding: utf-8 -*- from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, orderedSet, parse_duration, parse_iso8601, parse_qs, qualities, + str_or_none, traverse_obj, unified_strdate, + url_or_none, xpath_text, - js_to_json, - urljoin, - filter_dict, - HEADRequest, # Import HEADRequest ) -import re -import json -import urllib.error # Import urllib.error for HEAD check exception -# --- EuropaIE (Unchanged) --- + class EuropaIE(InfoExtractor): _WORKING = False _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' - _TESTS = [ - # Existing tests... - ] + _TESTS = [{ + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', + 'md5': '574f080699ddd1e19a675b0ddf010371', + 'info_dict': { + 'id': 'I107758', + 'ext': 'mp4', + 'title': 'TRADE - Wikileaks on TTIP', + 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', + }, + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }, { + 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) + playlist = self._download_xml( f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id) + def get_item(type_, preference): items = {} for item in playlist.findall(f'./info/{type_}/item'): - lang, label = (xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)) - if lang and label: items[lang] = label.strip() + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() for p in preference: - if items.get(p): return items[p] + if items.get(p): + return items[p] + query = parse_qs(url) preferred_lang = query.get('sitelang', ('en', ))[0] + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) + title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) + language_preference = qualities(preferred_langs[::-1]) + formats = [] for file_ in playlist.findall('./files/file'): video_url = xpath_text(file_, './url') - if not video_url: continue + if not video_url: + continue lang = xpath_text(file_, './lg') - formats.append({'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang)}) - return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } -# --- EuroParlWebstreamIE (Using JSON from iframe) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?: - multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| # Webstreaming page URL - live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?Pindex-archive|index|master|playlist|norsk-archive)(?:\.m3u8)? # Direct HLS URL base - ) + https?://multimedia\.europarl\.europa\.eu/ + (?P[^/]*/)?webstreaming/(?:[^_]*_)?(?P[\w-]+) ''' - _TESTS = [ - { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-agriculture-and-rural-development_20250327-0900-COMMITTEE-AGRI', - 'info_dict': { - 'id': '20250327-0900-COMMITTEE-AGRI', - 'title': r're:^Committee on Agriculture and Rural Development \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': False, - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, - # Uses the iframe JSON parsing which should yield 2113752 / channel-06-bxl + _TESTS = [{ + 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', + 'md5': '16420ad9c602663759538ac1ca16a8db', + 'info_dict': { + 'id': '20220914-0900-PLENARY', + 'ext': 'mp4', + 'title': 'Plenary session', + 'description': '', + 'duration': 45147, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'release_timestamp': 1663139069, + 'release_date': '20220914', + 'modified_timestamp': 1663650921, + 'modified_date': '20220920', + 'live_status': 'was_live', }, - { - 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/pre-session-briefing_20250328-1100-SPECIAL-PRESSEr', - 'info_dict': { - 'id': '20250328-1100-SPECIAL-PRESSEr', - 'title': r're:^Pre-session briefing \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': False, - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, - # Uses the iframe JSON parsing which should yield 2113747 / channel-01-bxl + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + 'md5': '8b4304f9e15a6e133100248fb55a5dce', + 'info_dict': { + 'ext': 'mp4', + 'id': '20221115-1000-SPECIAL-EUROSCOLA', + 'release_timestamp': 1668502798, + 'title': 'Euroscola', + 'release_date': '20221115', + 'live_status': 'was_live', + 'description': '', + 'duration': 9587, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1668945274, + 'modified_date': '20221120', }, - { # Test direct HLS URL with archive times - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113752/channel-06-bxl/index-archive.m3u8?startTime=1743068400&endTime=1743079800', - 'info_dict': { - 'id': 'index-archive', - 'title': 'European Parliament Stream 2113752/channel-06-bxl', - 'is_live': False, # Should be detected as not live from lack of live tags/duration - 'ext': 'mp4', - }, - 'params': {'skip_download': True}, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'md5': '0ca01cf33009d866e6f5e1cd3088c10c', + 'info_dict': { + 'id': '20230301-1130-COMMITTEE-CULT', + 'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, + 'description': 'Committee on Culture and Education', + 'duration': 1003, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + 'live_status': 'was_live', }, - # Potentially add a known live stream test if one is available - ] - - def _log_debug(self, msg): - self.to_screen(f"[EuroParlWebstream] {msg}") - - def _extract_title_from_webpage(self, webpage, display_id): - """Extracts title from the main webstreaming page.""" - title_element = self._search_regex(r']*>(.*?)', webpage, 'title element', default=None) - if title_element: - # Clean up potential extra whitespace and HTML entities - title = re.sub(r'\s+', ' ', title_element).strip() - title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=title) - else: - # Fallback using meta tags or just the ID - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, default=display_id) - return title.replace('_', ' ') # Replace underscores often used in IDs - - def _perform_head_check(self, url, display_id, note=''): - """Performs a HEAD request to check if the HLS URL likely exists.""" - self._log_debug(f'[{display_id}] Performing HEAD check {note}on: {url}') - try: - self._request_webpage(HEADRequest(url), display_id, note=f'HEAD check {note}') - self._log_debug(f'[{display_id}] HEAD check {note}successful.') - return True - except ExtractorError as e: - # Specifically catch HTTP errors, especially 404 - if isinstance(e.cause, urllib.error.HTTPError): - self._log_debug(f'[{display_id}] HEAD check {note}failed: {e.cause.code} {e.cause.reason}') - else: - self._log_debug(f'[{display_id}] HEAD check {note}failed: {e}') - return False + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'md5': 'f2e8c30935f956a7165c2f4f4b4ee090', + 'info_dict': { + 'id': '20230524-0900-COMMITTEE-ENVI', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': 'Committee on Environment, Public Health and Food Safety', + 'release_timestamp': 1684912288, + 'live_status': 'was_live', + 'description': 'Committee on Environment, Public Health and Food Safety', + 'duration': 4831, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER', + 'md5': '518758eb706471c4c4ef3a134034a5bd', + 'info_dict': { + 'id': '20240320-1345-SPECIAL-PRESSER', + 'ext': 'mp4', + 'release_date': '20240320', + 'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'release_timestamp': 1710939767, + 'description': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'duration': 927, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'modified_timestamp': 1732475771, + 'modified_date': '20241124', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20250328-1600-SPECIAL-PRESSER', + 'md5': 'dd1c5e67eb55e609998583d7c2966105', + 'info_dict': { + 'id': '20250328-1600-SPECIAL-PRESSER', + 'ext': 'mp4', + 'title': 'md5:04a2ab70c183dabe891a7cd190c3121d', + 'description': '', + 'duration': 1023, + 'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png', + 'release_timestamp': 1743177199, + 'release_date': '20250328', + 'modified_timestamp': 1743180924, + 'modified_date': '20250328', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = mobj.group('id') - live_id_direct = mobj.group('live_id') - - # --- Handle Direct HLS URL Input --- - if live_id_direct: - self._log_debug(f"Processing Direct HLS URL: {url}") - channel_direct = mobj.group('channel') - stream_type_direct = mobj.group('stream_type') or 'stream' # Default name if not specified - base_url = f'https://live.media.eup.glcloud.eu/hls/live/{live_id_direct}/{channel_direct}/{stream_type_direct}' - - query_params_str = mobj.group(0).split('?', 1)[1] if '?' in mobj.group(0) else None - query_params = parse_qs(query_params_str) if query_params_str else {} - start_time_direct = traverse_obj(query_params, ('startTime', 0, {int_or_none})) - end_time_direct = traverse_obj(query_params, ('endTime', 0, {int_or_none})) - - # Construct the final URL ensuring .m3u8 is present - final_url = base_url + ('' if base_url.endswith('.m3u8') else '.m3u8') - if start_time_direct and end_time_direct: - final_url += f"?startTime={start_time_direct}&endTime={end_time_direct}" - elif query_params_str: # Append original query if not start/end time based - final_url += f"?{query_params_str}" - - # Basic title for direct URL - title = f'European Parliament Stream {live_id_direct}/{channel_direct}' - - # HEAD check is good even for direct URLs - if not self._perform_head_check(final_url, f"{live_id_direct}-{channel_direct}", '(direct)'): - raise ExtractorError(f'Direct HLS URL HEAD check failed: {final_url}', expected=True) - - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - final_url, display_id or stream_type_direct, 'mp4', m3u8_id='hls', fatal=True) - if not formats: raise ExtractorError(f'Could not extract formats from direct HLS URL: {final_url}', expected=True) - - return { - 'id': display_id or stream_type_direct, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': not (start_time_direct and end_time_direct) and '.m3u8' not in stream_type_direct # Guess based on URL structure - } - - # --- Handle Webstreaming Page URL --- - if not display_id: raise ExtractorError('Could not parse display ID from URL', expected=True) - - self._log_debug(f"Processing Webstreaming Page: {display_id}") - webpage = self._download_webpage(url, display_id) - title = self._extract_title_from_webpage(webpage, display_id) # Get title early - - self._log_debug(f'[{display_id}] Extracting metadata and iframe URL...') - nextjs_data = self._search_nextjs_data(webpage, display_id, default={}) - media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {} - - # Get initial start time, but prioritize iframe JSON later - initial_start_timestamp = traverse_obj(media_info, ('mediaDate', {parse_iso8601}, {int_or_none})) - iframe_url = traverse_obj(media_info, 'iframeUrls') # Usually just one URL string - - self._log_debug(f'[{display_id}] Initial Start Time={initial_start_timestamp}, Iframe URL={iframe_url}') - - if not iframe_url: - raise ExtractorError(f'[{display_id}] Could not find iframe URL in page metadata.', expected=True) - - # --- Attempt Extraction from Iframe JSON --- - self._log_debug(f'[{display_id}] Attempting extraction from iframe: {iframe_url}') - try: - iframe_content = self._download_webpage(iframe_url, display_id, note='Downloading iframe content') - json_data_str = self._search_regex( - r'', - iframe_content, 'iframe JSON data', default=None) - - if not json_data_str: - raise ExtractorError('Could not find ng-state JSON in iframe content.') - - iframe_json = self._parse_json(json_data_str, display_id, fatal=True) - - # Extract required info from the JSON structure - player_url_base = traverse_obj(iframe_json, ('contentEventKey', 'playerUrl')) - start_time = traverse_obj(iframe_json, ('contentEventKey', 'startTime', {int_or_none})) - end_time = traverse_obj(iframe_json, ('contentEventKey', 'endTime', {int_or_none})) - is_live = traverse_obj(iframe_json, ('contentEventKey', 'live')) # boolean - # Use title from JSON if available and seems better - json_title = traverse_obj(iframe_json, ('contentEventKey', 'title')) - if json_title: title = json_title - - - self._log_debug(f'[{display_id}] Found in iframe JSON: playerUrl={player_url_base}, startTime={start_time}, endTime={end_time}, is_live={is_live}') - - if not player_url_base: - raise ExtractorError('Could not extract playerUrl from iframe JSON.') - - # For recorded streams (archives), startTime and endTime are essential - if not is_live and (start_time is None or end_time is None): - raise ExtractorError('Missing startTime or endTime in iframe JSON for recorded stream.') - - # Construct the final URL - # Ensure base URL doesn't already have query params before adding ours - player_url_base = player_url_base.split('?')[0] - if not player_url_base.endswith('.m3u8'): - player_url_base += '.m3u8' # Ensure correct extension - - if is_live: - final_player_url = player_url_base # Live streams don't use start/end times - else: - final_player_url = f"{player_url_base}?startTime={start_time}&endTime={end_time}" - - # Perform HEAD check on the constructed URL - if not self._perform_head_check(final_player_url, display_id, '(dynamic)'): - raise ExtractorError(f'Dynamic HLS URL from iframe failed HEAD check: {final_player_url}') - - # Extract formats - self._log_debug(f'[{display_id}] Extracting formats from {final_player_url}') - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - final_player_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=True) # Use fatal=True, if extraction fails, it's an error - - if not formats: - raise ExtractorError(f'Could not extract M3U8 formats from {final_player_url}', expected=True) - - return { - 'id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - 'timestamp': start_time if not is_live else None, # Use JSON start time for VOD - 'duration': (end_time - start_time) if not is_live and start_time and end_time else None, - } - - except ExtractorError as e: - # Re-raise specific extractor errors - raise e - except Exception as e: - # Wrap unexpected errors - raise ExtractorError(f'[{display_id}] Error processing iframe content: {e}', cause=e) - - # This part should ideally not be reached if iframe extraction is mandatory - raise ExtractorError(f'[{display_id}] Failed to extract stream information from iframe.', expected=True) + lang, video_id = self._match_valid_url(url).group('lang', 'id') + query = { + 'lang': lang, + 'audio': lang, + 'autoplay': 'true', + 'logo': 'false', + 'muted': 'false', + 'fullscreen': 'true', + 'disclaimer': 'false', + 'multicast': 'true', + 'analytics': 'false', + } + webpage = self._download_webpage(f'https://control.eup.glcloud.eu/content-manager/content-page/{video_id}', + video_id, 'Downloading iframe', query=query) + stream_info = self._search_json(r'