diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 9d3cfa5a80..c768b9f7d7 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -12,7 +12,8 @@ from ..utils import ( unified_strdate, xpath_text, ) -import re # Import re for findall +# Removed the unused 're' import; no replacement import is added, since nothing +# in the current modification requires one. # --- EuropaIE (Older extractor - unchanged) --- # This extractor handles older ec.europa.eu/avservices URLs and is likely defunct. @@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor): return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} -# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) --- +# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) --- class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://multimedia\.europarl\.europa\.eu/ (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format ''' _TESTS = [{ - # Existing VOD test + # Existing VOD test (Should now work better if metadata is consistent) 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', @@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor): }, 'params': {'skip_download': True}, }, { - # Test case that previously failed with regex method + # Test case likely representing an archive/VOD (based on previous context) 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA', 'info_dict': { 'id': str, # ID might be a string UUID or similar 'display_id': '20250328-1000-SPECIAL-EUROSCOLA', 'ext': 'mp4', 'title': r're:Euroscola', # Expect title containing 
Euroscola - 'release_timestamp': int, # Expecting a Unix timestamp + 'release_timestamp': int, # Expecting a Unix timestamp (start time) 'release_date': '20250328', - 'is_live': bool, # Could be True (if near event time) or False + 'is_live': False, # Should be detected as not live }, 'params': {'skip_download': True}, - # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly }] def _real_extract(self, url): @@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor): # Extract basic info, falling back to display_id if metadata is sparse internal_id = media_info.get('id') or display_id title = media_info.get('title') or media_info.get('name') or display_id - release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601})) + + # Extract start and end timestamps, if available + # parse_iso8601 typically returns a float/int timestamp + start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none})) + end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none})) + release_timestamp = start_timestamp # Use start time as the release timestamp + # Determine live status based on metadata hint, if available + # Treat as not live if 'Live' subtype isn't explicitly present is_live = media_info.get('mediaSubType') == 'Live' hls_url = None # Variable to store the found HLS URL # --- Attempt 1: Find direct HLS URL in media_info --- # Check common dictionary keys where the full HLS URL might be stored. - # Add more potential keys here if observed in website data. 
possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl') hls_url = traverse_obj(media_info, possible_keys) if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL self.to_screen(f'Found direct HLS URL in metadata: {hls_url}') + # Check if it's an archive URL but missing time params - might need correction later if it fails + if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp: + self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.') + hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' + self.to_screen(f'Corrected direct HLS URL: {hls_url}') + else: - hls_url = None # Reset if found value wasn't an HLS URL + hls_url = None # Reset if found value wasn't an HLS URL or needs construction - # --- Attempt 2: Construct HLS URL from IDs in media_info --- + # --- Attempt 2: Construct HLS URL from IDs and Times in media_info --- if not hls_url: - self.to_screen('Attempting to construct HLS URL from metadata IDs...') - # Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common, - # but might differ. Use traverse_obj to safely get values. - # 'id' from media_info is often the event ID. + self.to_screen('Attempting to construct HLS URL from metadata...') event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id')) - # Channel ID might be numeric or a string name. channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel')) if event_id and channel_id: - # Construct the URL using the assumed live/default pattern. - # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed. - # This assumes the event is live or uses the default endpoint. 
- constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' - hls_url = constructed_url - self.to_screen(f'Constructed potential HLS URL: {hls_url}') + if not is_live and start_timestamp and end_timestamp: + # Construct ARCHIVE/VOD URL with time parameters + constructed_url = ( + f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/' + f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}' + ) + hls_url = constructed_url + self.to_screen(f'Constructed Archive HLS URL: {hls_url}') + elif is_live: + # Construct LIVE URL (basic pattern, might need adjustments) + constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' + hls_url = constructed_url + self.to_screen(f'Constructed Live HLS URL: {hls_url}') + else: + self.to_screen('Could not construct URL: Missing live status or timestamps for archive.') else: - self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.') + self.to_screen('Could not construct URL: Missing event or channel ID in metadata.') # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) --- if not hls_url: self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...') - m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)' + # Updated regex to potentially capture archive URLs with parameters, but prioritize construction + m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)' hls_url = self._search_regex( m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False) if hls_url: - self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') + self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') + # If regex found an archive URL without params, try adding them as a last resort + if not is_live and 'index-archive.m3u8' in hls_url 
and '?startTime=' not in hls_url and start_timestamp and end_timestamp: + self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.') + hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}' + self.to_screen(f'Corrected regex HLS URL: {hls_url}') else: - # This is where the original "Could not find any .m3u8 link" warning occurred. - self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') + self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') # --- Process HLS Playlist --- if not hls_url: - # If no URL was found after all attempts, raise an error. - raise ExtractorError( - 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', - expected=True) # expected=True prevents stack trace for common errors - - # Pass the found HLS URL to the HLS processing function. - # The _extract_m3u8_formats function usually detects live/VOD automatically. - # The 'live=is_live' hint can sometimes help but isn't strictly necessary. + raise ExtractorError( + 'No HLS URL (.m3u8) could be found or constructed. 
The website structure might have changed.', + expected=True) + + # Pass the final HLS URL to the processing function formats, subtitles = self._extract_m3u8_formats_and_subtitles( - hls_url, display_id, ext='mp4', live=is_live, fatal=False) + hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats # Check if HLS processing returned any formats if not formats: - raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True) + # Try again, forcing VOD interpretation if it was marked live but failed + if is_live: + self.to_screen('Live HLS processing failed, attempting again as VOD...') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_url, display_id, ext='mp4', live=False, fatal=False) + + # If still no formats, raise error + if not formats: + raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True) + # --- Return Extracted Information --- return { @@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'release_timestamp': release_timestamp, - 'is_live': is_live or None, # Use None if not explicitly marked Live + 'is_live': is_live, # Keep original detected live status }