From 2afe7e2fa3b21ab6f6879a05a06043eedbc864eb Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Tue, 25 Mar 2025 16:02:51 +0000 Subject: [PATCH] Update europa.py: this makes it work with videos from the archive, but not with live videos --- yt_dlp/extractor/europa.py | 205 ++++++++++++++----------------------- 1 file changed, 76 insertions(+), 129 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 7470305e95..c3a03bf591 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -10,10 +10,13 @@ from ..utils import ( unified_strdate, xpath_text, ExtractorError, + js_to_json, + urljoin ) import re -import datetime +import json import time +import datetime class EuropaIE(InfoExtractor): @@ -132,7 +135,6 @@ class EuroParlWebstreamIE(InfoExtractor): 'display_id': '20250324-1500-COMMITTEE-HOUS', 'ext': 'mp4', 'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting', - 'is_live': False, }, 'params': { 'skip_download': True, @@ -140,21 +142,13 @@ class EuroParlWebstreamIE(InfoExtractor): }] # Known working stream IDs (in order of likely success) - _ARCHIVE_STREAM_IDS = [ + KNOWN_STREAM_IDS = [ "index-archive", "norsk-archive", ] - - # Live stream IDs - _LIVE_STREAM_IDS = [ - "index", - "master", - "playlist", - "norsk", - ] # Known CDN endpoints (in order of likely success) - _ENDPOINTS = [ + KNOWN_ENDPOINTS = [ "2113753", # This appears to be the main endpoint "2113749", "2113750", @@ -164,7 +158,7 @@ class EuroParlWebstreamIE(InfoExtractor): ] # Prioritized channel list based on observations (channel-07-bxl is often used) - _CHANNELS = [ + PRIORITIZED_CHANNELS = [ "channel-07-bxl", # Most common based on examples "channel-03-bxl", # Also seen in examples "channel-01-bxl", @@ -179,6 +173,7 @@ class EuroParlWebstreamIE(InfoExtractor): def _parse_meeting_id(self, display_id): """Extract date and time information from the meeting ID.""" + # Format: 
YYYYMMDD-HHMM-COMMITTEE-TYPE date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) if date_match: date_str, time_str, meeting_type = date_match.groups() @@ -196,14 +191,6 @@ class EuroParlWebstreamIE(InfoExtractor): # Calculate a reasonable meeting duration (2 hours by default) end_dt = meeting_dt + datetime.timedelta(hours=2) - # Check if meeting is today or in the future (potential live stream) - now = datetime.datetime.now() - is_today = (meeting_dt.year == now.year and - meeting_dt.month == now.month and - meeting_dt.day == now.day) - is_future = meeting_dt > now - is_recent_past = now - meeting_dt < datetime.timedelta(hours=6) - return { 'date': date_str, 'time': time_str, @@ -212,10 +199,6 @@ class EuroParlWebstreamIE(InfoExtractor): 'end_dt': end_dt, 'start_timestamp': int(meeting_dt.timestamp()), 'end_timestamp': int(end_dt.timestamp()), - 'is_today': is_today, - 'is_future': is_future, - 'is_recent_past': is_recent_past, - 'is_live_candidate': is_today or is_future or is_recent_past, } except (ValueError, OverflowError) as e: self.report_warning(f"Failed to parse meeting date/time: {e}") @@ -225,11 +208,11 @@ class EuroParlWebstreamIE(InfoExtractor): return { 'start_timestamp': current_time - 86400, # 24 hours ago 'end_timestamp': current_time, - 'is_live_candidate': True, # Assume it might be live if we can't parse the time } def _find_m3u8_in_webpage(self, webpage): """Look for m3u8 URLs directly in the webpage.""" + # Look for direct m3u8 URLs with timestamps m3u8_matches = re.findall( r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])', webpage @@ -252,40 +235,6 @@ class EuroParlWebstreamIE(InfoExtractor): title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() return title - def _check_is_live(self, webpage): - """Check if the stream is likely to be live based on webpage content.""" - live_indicators = [ - r'(?i)live\s+now', - r'(?i)streaming\s+live', - r'(?i)watch\s+live', - r'(?i)live\s+stream', - 
r'(?i)currently\s+live', - r'(?i)livestream', - r'isLive\s*[:=]\s*true', - r'"isLive"\s*:\s*true', - r'data-is-live\s*=\s*["\'](true|1)["\']', - ] - - for indicator in live_indicators: - if re.search(indicator, webpage): - return True - - return False - - def _try_url(self, url, display_id): - """Try a single URL and return formats and subtitles if successful.""" - try: - self.to_screen(f"Trying URL: {url}") - fmt, subs = self._extract_m3u8_formats_and_subtitles( - url, display_id, 'mp4', m3u8_id='hls', fatal=False) - - if fmt: - return fmt, subs - except ExtractorError as e: - self.report_warning(f"Failed with URL {url}: {e}") - - return None, None - def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') @@ -312,104 +261,102 @@ class EuroParlWebstreamIE(InfoExtractor): webpage = self._download_webpage(url, display_id) title = self._extract_title_from_webpage(webpage) - # Check if this is likely to be a live stream - is_live_page = self._check_is_live(webpage) - # First, look for m3u8 URLs directly in the page direct_urls = self._find_m3u8_in_webpage(webpage) if direct_urls: self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage") for m3u8_url in direct_urls: - formats, subtitles = self._try_url(m3u8_url, display_id) - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live_page, - } + try: + self.to_screen(f"Trying direct URL: {m3u8_url}") + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if formats: + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + } + except ExtractorError as e: + self.report_warning(f"Failed with direct URL {m3u8_url}: {e}") - # Parse the meeting ID and check if this is potentially a live stream + # If no direct URLs found, parse the 
meeting ID and generate likely timestamps meeting_info = self._parse_meeting_id(display_id) start_timestamp = meeting_info.get('start_timestamp') end_timestamp = meeting_info.get('end_timestamp') - is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}") - self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}") - # First check for live streams if this is a live candidate - if is_live_candidate: - self.to_screen("Checking for live stream URLs first") - - for endpoint in self._ENDPOINTS[:2]: # Only try the first two endpoints for live - for channel in self._CHANNELS[:3]: # Only try the top 3 channels for live - for stream_type in self._LIVE_STREAM_IDS: - # For live streams, try URLs without timestamps - live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8" - formats, subtitles = self._try_url(live_url, display_id) - - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': True, - } + # Try a variety of possibilities, starting with the most likely combinations + formats = [] + subtitles = {} + working_url = None - # Try archived streams with prioritized channels - for channel in self._CHANNELS: - for stream_type in self._ARCHIVE_STREAM_IDS: - # For archived content, include timestamps - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - formats, subtitles = self._try_url(archive_url, display_id) + # Main endpoint with prioritized channels + for channel in self.PRIORITIZED_CHANNELS: + for stream_type in self.KNOWN_STREAM_IDS: + candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + 
self.to_screen(f"Trying URL: {candidate_url}") - if formats: - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': False, - } - - # If main endpoint + prioritized channels didn't work, try other endpoints - for endpoint in self._ENDPOINTS[1:]: - for channel in self._CHANNELS[:3]: # Only try the top 3 channels for other endpoints - for stream_type in self._ARCHIVE_STREAM_IDS: - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - formats, subtitles = self._try_url(archive_url, display_id) + try: + fmt, subs = self._extract_m3u8_formats_and_subtitles( + candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - if formats: + if fmt: + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + working_url = candidate_url + self.to_screen(f"Success! Found working URL: {working_url}") + return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': False, } + except ExtractorError as e: + self.report_warning(f"Failed with URL {candidate_url}: {e}") + + # If main endpoint + prioritized channels didn't work, try other endpoints + for endpoint in self.KNOWN_ENDPOINTS[1:]: # Skip the first one as we already tried it + for channel in self.PRIORITIZED_CHANNELS[:3]: # Only try the top 3 channels for other endpoints + for stream_type in self.KNOWN_STREAM_IDS: + candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + self.to_screen(f"Trying URL: {candidate_url}") + + try: + fmt, subs = self._extract_m3u8_formats_and_subtitles( + candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + + if fmt: + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + working_url = candidate_url + 
self.to_screen(f"Success! Found working URL: {working_url}") + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + } + except ExtractorError as e: + self.report_warning(f"Failed with URL {candidate_url}: {e}") # If we've reached here, we need to give a helpful error message parsed_date = f"{meeting_info.get('date', 'unknown-date')}" parsed_time = f"{meeting_info.get('time', 'unknown-time')}" - # Provide different suggestions based on whether it's likely live or archived - if is_live_candidate: - suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8" - suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\"" - else: - suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\"" + # Provide the most likely URL for manual use + suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" raise ExtractorError( f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n" - f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n" - f"{suggestion_text}", + f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n" + f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"", expected=True )