Update europa.py

6 months ago · fe08c6ca27
parent 43ba015d27
commit fe08c6ca27
1 changed files with 205 additions and 146 deletions
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@ -1,3 +1,4 @@
+# coding: utf-8
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
@ -103,7 +104,7 @@ class EuroParlWebstreamIE(InfoExtractor):
    _VALID_URL = r'''(?x)
        https?://(?:
            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w-]+)(?:\.m3u8|/master\.m3u8)
+            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>[\w-]+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows
        )
    '''
    _TESTS = [{
@ -118,10 +119,21 @@ class EuroParlWebstreamIE(InfoExtractor):
            'skip_download': True,
        },
    }, {
-        # Direct HLS stream URL
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870',
+        # Direct HLS stream URL (archive example similar to user provided)
+        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442',
        'info_dict': {
-            'id': 'index-archive',
+            'id': 'norsk-archive', # ID derived from filename before query
+            'ext': 'mp4',
+            'title': 'European Parliament Stream',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    },{
+        # Direct HLS stream URL (live example)
+        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8',
+        'info_dict': {
+            'id': 'index',
            'ext': 'mp4',
            'title': 'European Parliament Stream',
        },
@ -130,43 +142,53 @@ class EuroParlWebstreamIE(InfoExtractor):
        },
    }]

-    # Main CDN endpoint - primarily target this instead of trying multiple
-    MAIN_ENDPOINT = "2113753"
+    # Known CDN endpoints - try these if direct extraction fails
+    # Added 2113713 and 2113713-b based on user's M3U8
+    ENDPOINTS = ["2113753", "2113713", "2113713-b"]

-    # Priority channels based on observed success rates
-    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"]
+    # Priority channels based on observed success rates & user M3U8
+    # Added channel-01-stb
+    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"]

-    # Default stream types by content type
-    LIVE_STREAM_TYPES = ["index", "master", "playlist"]
-    ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"]
+    # Default stream types/filenames by content type
+    # These are used in the *fallback* guessing logic.
+    # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed.
+    LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"]
+    ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"]

    def _extract_direct_url_from_webpage(self, webpage):
        """Extract direct m3u8 URLs from webpage with minimal logging"""
-        m3u8_urls = []
+        m3u8_urls = set() # Use a set to avoid duplicates

        # Search patterns for m3u8 URLs
+        # Added more flexibility for quotes and paths
        for pattern in [
-            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']',
+            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']',
            r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
+            # Look for assignments or attributes
+            r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
+            # Look for URLs within JSON-like structures in script tags
+            r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
+            r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
        ]:
            matches = re.findall(pattern, webpage)
-            if matches:
-                m3u8_urls.extend(matches)
-        
-        # Clean up URLs
-        clean_urls = []
-        for url in m3u8_urls:
+            for match in matches:
+                # Handle potential tuple results from findall if multiple groups exist in regex
+                url_match = match if isinstance(match, str) else match[0]
+                # Basic sanity check
+                if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match:
                    # Remove any JS string escaping
-            url = url.replace('\\/', '/').replace('\\\\', '\\')
-            clean_urls.append(url)
+                    url_match = url_match.replace('\\/', '/').replace('\\\\', '\\')
+                    m3u8_urls.add(url_match)

-        # Extract from network panel if available
+        # Extract from network panel if available (less reliable parsing)
        network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE)
        if network_url_match:
-            clean_urls.append(network_url_match.group(1))
+            url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\')
+            m3u8_urls.add(url_match)

-        return clean_urls
+        self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage')
+        return list(m3u8_urls)

    def _extract_title_from_webpage(self, webpage, display_id):
        """Extract the title from the webpage"""
@ -174,6 +196,7 @@ class EuroParlWebstreamIE(InfoExtractor):
        for pattern in [
            r'<meta property="og:title" content="([^"]+)"',
            r'<title>([^<]+)</title>',
+            r'<h1[^>]*class="erpl_title-h1"[^>]*>([^<]+)</h1>', # Specific title class
            r'<h1[^>]*>([^<]+)</h1>',
            r'"title"\s*:\s*"([^"]+)"',
        ]:
@ -181,17 +204,18 @@ class EuroParlWebstreamIE(InfoExtractor):
            if title_match:
                title = title_match.group(1).strip()
                # Clean up common suffixes
-                title = re.sub(r'\s*\|\s*European Parliament$', '', title)
-                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title)
+                title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
+                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip()
+                if title:
                    return title

-        return f"European Parliament Session - {display_id}"
+        return f"European Parliament Session - {display_id}" # Fallback title

    def _parse_meeting_date(self, display_id):
        """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)"""
        date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
        if date_match:
-            date_str, time_str, meeting_type = date_match.groups()
+            date_str, time_str, _ = date_match.groups()
            try:
                # Parse the date components
                year = int(date_str[:4])
@ -200,163 +224,198 @@ class EuroParlWebstreamIE(InfoExtractor):
                hour = int(time_str[:2])
                minute = int(time_str[2:4])

-                # Create timestamps with a generous window (3 hours before and after)
-                meeting_dt = datetime.datetime(year, month, day, hour, minute)
+                # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after)
+                # This helps catch streams that start slightly early or run long
+                meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC
                start_dt = meeting_dt - datetime.timedelta(hours=3)
-                end_dt = meeting_dt + datetime.timedelta(hours=6)
+                end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window

-                # Convert to timestamps
+                # Convert to Unix timestamps
                start_ts = int(start_dt.timestamp())
                end_ts = int(end_dt.timestamp())

+                self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}')
                return start_ts, end_ts

-            except (ValueError, OverflowError):
-                pass
+            except (ValueError, OverflowError) as e:
+                 self.to_screen(f'Error parsing date from display_id "{display_id}": {e}')
+                 pass # Fall through to fallback

-        # Fallback to a recent 48-hour window
+        # Fallback to a recent window if parsing fails or ID format is different
+        self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.')
        now = int(time.time())
-        start_time = now - (48 * 3600)  # 48 hours ago
-        return start_time, now
+        start_time = now - (24 * 3600)  # 24 hours ago (might be too short for older archives)
+        end_time = now + (1 * 3600)      # 1 hour in the future (for live/recent)
+        return start_time, end_time

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
+        # Get potential IDs from the regex match groups
        display_id = mobj.group('id')
        live_id = mobj.group('live_id')
        stream_id = mobj.group('stream_id')
        channel = mobj.group('channel')

-        # Handle direct HLS URLs
-        if live_id and stream_id:
-            # Remove query parameters from stream_id if present
-            clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id
+        # Use the most specific ID available
+        video_id = display_id or stream_id or live_id or channel
+
+        # Handle direct HLS URLs first (most reliable if provided)
+        if live_id and (stream_id or channel):
+            # Clean up stream_id (remove query parameters for use as info dict id)
+            clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id
+            # If stream_id is missing but channel exists, use channel as part of the id
+            final_id = clean_stream_id or channel or 'unknown_stream'
+            # Remove potential .m3u8 suffix for cleaner ID
+            if final_id.endswith('.m3u8'):
+                 final_id = final_id[:-5]

+            self.to_screen(f'Processing direct HLS URL: {url}')
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True)
+                url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues
+
+            if not formats:
+                 self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}')
+                 # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work
+                 # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True)

            return {
-                'id': clean_stream_id,
-                'title': 'European Parliament Stream',
-                'formats': formats,
-                'subtitles': subtitles,
+                'id': final_id,
+                'title': 'European Parliament Stream', # Generic title for direct URLs
+                'formats': formats or [],
+                'subtitles': subtitles or {},
+                'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL
            }

-        # Download the webpage for standard europarl URLs
+        # --- Fallback for multimedia.europarl.europa.eu URLs ---
+        if not display_id: # Should have display_id if it's not a direct HLS URL
+             raise ExtractorError('Failed to identify video ID from URL.')
+
+        self.to_screen(f'Processing webpage URL: {url}')
        webpage = self._download_webpage(url, display_id)

-        # Check for live indicators
-        is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE))
+        # Check for live indicators more reliably
+        # Look for common live indicators in JS, classes, or text
+        is_live = bool(re.search(
+            r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)',
+            webpage,
+            re.IGNORECASE))
+        self.to_screen(f'Detected as live: {is_live}')

        # Extract title
        title = self._extract_title_from_webpage(webpage, display_id)

-        # First try direct URLs from the webpage (this is the most reliable approach)
+        # *** Strategy 1: Extract direct URLs from webpage (Preferred) ***
        direct_urls = self._extract_direct_url_from_webpage(webpage)
-        
-        # Track whether we successfully found a stream
        formats = []
        subtitles = {}

        if direct_urls:
+            self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...')
            for m3u8_url in direct_urls:
+                # Clean stream ID from URL for format identification
+                m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0]
+                if m3u8_stream_id.endswith('.m3u8'):
+                    m3u8_stream_id = m3u8_stream_id[:-5]
+
                try:
                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                        m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error

                    if fmt:
+                        self.to_screen(f'Successfully extracted formats from: {m3u8_url}')
                        formats.extend(fmt)
                        self._merge_subtitles(subs, target=subtitles)
-                        
+                        # If we found formats, we are likely done, return immediately
                        return {
                            'id': display_id,
                            'display_id': display_id,
                            'title': title,
                            'formats': formats,
                            'subtitles': subtitles,
-                            'is_live': is_live,
+                            'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL
                        }
-                except ExtractorError:
-                    pass
-        
-        # Parse timestamps for archive retrieval (or use current time for live)
-        if is_live:
-            # For live streams, we don't need timestamps
-            start_timestamp, end_timestamp = None, None
                    else:
-            start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
-        
-        # Use appropriate stream types for the content type
-        stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES
+                        self.to_screen(f'No formats found in: {m3u8_url}')
+                except ExtractorError as e:
+                    self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}')
+                    pass # Try the next direct URL
+        else:
+            self.to_screen('No direct M3U8 URLs found in webpage.')

-        # Try combinations with improved targeting
-        for channel in self.PRIORITY_CHANNELS:
-            for stream_type in stream_types:
-                # For live streams, try without timestamps first
-                if is_live:
-                    live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8"

-                    try:
-                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            live_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+        # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) ***
+        self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...')

-                        if fmt:
-                            formats.extend(fmt)
-                            self._merge_subtitles(subs, target=subtitles)
+        # Parse timestamps for archive retrieval (or use a window for live/unknown)
+        # Always parse, even if live, as it might be a recently finished live event
+        start_timestamp, end_timestamp = self._parse_meeting_date(display_id)

-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': True,
-                            }
-                    except ExtractorError:
-                        pass
+        # Use appropriate stream filenames for the content type
+        stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES
+
+        # Try combinations with updated endpoints and channels
+        for endpoint in self.ENDPOINTS:
+            for channel_to_try in self.PRIORITY_CHANNELS:
+                for filename in stream_filenames:
+                    base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}"
+
+                    # Determine if timestamps should be added
+                    # Add timestamps if it's explicitly not live, OR if the filename suggests archive,
+                    # OR if start/end timestamps were successfully parsed from the ID.
+                    # Avoid timestamps for clearly live filenames unless forced by non-live status.
+                    use_timestamps = (
+                        (not is_live or 'archive' in filename.lower())
+                        and start_timestamp and end_timestamp
+                    )

-                # For archived content (or as fallback for live), try with timestamps
-                if start_timestamp and end_timestamp:
-                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                    test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url

                    try:
+                        self.to_screen(f'Trying guessed URL: {test_url}')
                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                            test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False)

                        if fmt:
+                            self.to_screen(f'Success with guessed URL: {test_url}')
                            formats.extend(fmt)
                            self._merge_subtitles(subs, target=subtitles)
-                            
+                            # Found a working combination
                            return {
                                'id': display_id,
                                'display_id': display_id,
                                'title': title,
                                'formats': formats,
                                'subtitles': subtitles,
-                                'is_live': False,
+                                'is_live': not use_timestamps, # If we used timestamps, assume not live
                            }
-                    except ExtractorError:
-                        pass
-        
-        # Provide helpful error with the most likely working URLs
-        suggested_urls = []
-        
-        # Add the URLs that are most likely to work based on the logs and screenshots
-        if start_timestamp and end_timestamp:
-            suggested_urls.extend([
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}",
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-            ])
                        else:
-            suggested_urls.extend([
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8",
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8"
-            ])
-        
-        suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls])
-        
-        raise ExtractorError(
-            f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n"
-            f"Live stream detected: {is_live}\n"
-            f"Try using yt-dlp directly with one of these URLs:\n{suggestions}",
-            expected=True
+                            self.to_screen(f'No formats found in guessed URL: {test_url}')
+
+                    except ExtractorError as e:
+                        # Log error lightly, as many guesses are expected to fail
+                        self.to_screen(f'Guessed URL failed: {test_url} ({e})')
+                        pass # Continue trying other combinations
+
+        # *** If all strategies fail ***
+        self.to_screen('All extraction strategies failed.')
+
+        # Provide helpful error with suggestions
+        error_message = (
+            f"Could not extract stream URL for {display_id or url}. "
+            "The stream may be old, expired, or use an unsupported format.\n"
+            f"Live status detected: {is_live}\n"
+            "Common issues:\n"
+            "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n"
+            "- The event might not be available via the standard CDN endpoints/channels.\n"
+            "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n"
+            "Example (using parsed times, adjust if needed):\n"
        )
+        if start_timestamp and end_timestamp:
+             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+             error_message += f'yt-dlp "{example_url}"'
+        else:
+             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8"
+             error_message += f'yt-dlp "{example_url}"'
+
+
+        raise ExtractorError(error_message, expected=True)