Update europa.py

6 months ago · d597dc61a2
parent db1f9be975
commit d597dc61a2
1 changed files with 66 additions and 38 deletions
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@ -12,7 +12,8 @@ from ..utils import (
    unified_strdate,
    xpath_text,
 )
-import re # Import re for findall
+# NOTE: the unused 're' import was removed and no new import was added; 'urllib.parse' may be useful later
 # but not strictly required for current modification.
 # --- EuropaIE (Older extractor - unchanged) ---
 # This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor):
        return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
-# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) ---
+# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
 class EuroParlWebstreamIE(InfoExtractor):
    _VALID_URL = r'''(?x)
        https?://multimedia\.europarl\.europa\.eu/
        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
    '''
    _TESTS = [{
-        # Existing VOD test
+        # Existing VOD test (Should now work better if metadata is consistent)
        'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
        'info_dict': {
            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor):
        },
        'params': {'skip_download': True},
    }, {
-        # Test case that previously failed with regex method
+        # Archive/VOD test case (expects 'is_live' to be False)
        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
        'info_dict': {
            'id': str, # ID might be a string UUID or similar
            'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
            'ext': 'mp4',
            'title': r're:Euroscola', # Expect title containing Euroscola
-            'release_timestamp': int, # Expecting a Unix timestamp
+            'release_timestamp': int, # Expecting a Unix timestamp (start time)
            'release_date': '20250328',
-            'is_live': bool, # Could be True (if near event time) or False
+            'is_live': False, # Should be detected as not live
        },
        'params': {'skip_download': True},
        # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
    }]
    def _real_extract(self, url):
@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor):
        # Extract basic info, falling back to display_id if metadata is sparse
        internal_id = media_info.get('id') or display_id
        title = media_info.get('title') or media_info.get('name') or display_id
-        release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
+
        # Extract start and end timestamps, if available
        # parse_iso8601 typically returns a float/int timestamp
        start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
        end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
        release_timestamp = start_timestamp # Use start time as the release timestamp
        # Determine live status based on metadata hint, if available
        # Treat as not live if 'Live' subtype isn't explicitly present
        is_live = media_info.get('mediaSubType') == 'Live'
        hls_url = None # Variable to store the found HLS URL
        # --- Attempt 1: Find direct HLS URL in media_info ---
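        # Look for a full HLS URL under one of several candidate metadata keys.
        # Add more potential keys here if observed in website data.
        # NOTE(review): traverse_obj treats a flat tuple as ONE nested path
        # (media_info['hlsUrl']['streamUrl']...), not as alternatives; each key
        # probably needs to be passed as its own path -- confirm against traverse_obj docs.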
        possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
        hls_url = traverse_obj(media_info, possible_keys)
        if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
            self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
            # Check if it's an archive URL but missing time params - might need correction later if it fails
            if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
                 self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
                 hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
                 self.to_screen(f'Corrected direct HLS URL: {hls_url}')
        else:
-            hls_url = None # Reset if found value wasn't an HLS URL
+            hls_url = None # Reset if found value wasn't an HLS URL or needs construction
-        # --- Attempt 2: Construct HLS URL from IDs in media_info ---
+        # --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
        if not hls_url:
-            self.to_screen('Attempting to construct HLS URL from metadata IDs...')
+            self.to_screen('Attempting to construct HLS URL from metadata...')
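            # Try to extract the event and channel IDs. Keys like 'eventId' and 'channelId'
            # are guesses and might differ; 'id' from media_info is often the event ID.
            # NOTE(review): each tuple below is a single nested path for traverse_obj,
            # not a list of alternative keys -- confirm and pass one path per key if so.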
            event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
            # Channel ID might be numeric or a string name.
            channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
            if event_id and channel_id:
-                # Construct the URL using the assumed live/default pattern.
+                if not is_live and start_timestamp and end_timestamp:
-                # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed.
+                    # Construct ARCHIVE/VOD URL with time parameters
-                # This assumes the event is live or uses the default endpoint.
+                    constructed_url = (
                        f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
                        f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
                    )
                    hls_url = constructed_url
                    self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
                elif is_live:
                     # Construct LIVE URL (basic pattern, might need adjustments)
                    constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
                    hls_url = constructed_url
-                self.to_screen(f'Constructed potential HLS URL: {hls_url}')
+                    self.to_screen(f'Constructed Live HLS URL: {hls_url}')
                else:
-                self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.')
+                    self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
            else:
                self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
        # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
        if not hls_url:
            self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
-            m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)'
+            # Fallback regex: matches any *.media.eup.glcloud.eu host and stops at a single or double quote, so trailing query parameters (e.g. archive startTime/endTime) stay in the match
            m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
            hls_url = self._search_regex(
                m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
            if hls_url:
                self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
                # If regex found an archive URL without params, try adding them as a last resort
                if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
                    self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
                    hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
                    self.to_screen(f'Corrected regex HLS URL: {hls_url}')
            else:
                # This is where the original "Could not find any .m3u8 link" warning occurred.
                self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
        # --- Process HLS Playlist ---
        if not hls_url:
            # If no URL was found after all attempts, raise an error.
            raise ExtractorError(
                'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
-                 expected=True) # expected=True prevents stack trace for common errors
+                expected=True)
-        # Pass the found HLS URL to the HLS processing function.
+        # Pass the final HLS URL to the processing function
        # The _extract_m3u8_formats_and_subtitles function usually detects live/VOD automatically.
        # The 'live=is_live' hint can sometimes help but isn't strictly necessary.
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            hls_url, display_id, ext='mp4', live=is_live, fatal=False)
+            hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
        # Check if HLS processing returned any formats
        if not formats:
-             raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True)
+             # Try again, forcing VOD interpretation if it was marked live but failed
             if is_live:
                 self.to_screen('Live HLS processing failed, attempting again as VOD...')
                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                     hls_url, display_id, ext='mp4', live=False, fatal=False)
             # If still no formats, raise error
             if not formats:
                 raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
        # --- Return Extracted Information ---
        return {
@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor):
            'formats': formats,
            'subtitles': subtitles,
            'release_timestamp': release_timestamp,
-            'is_live': is_live or None, # Use None if not explicitly marked Live
+            'is_live': is_live, # Keep original detected live status
        }