From b652a8a6b14b61019e7638f0e6abc3c3a580b44a Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Tue, 25 Mar 2025 14:25:17 +0000
Subject: [PATCH 1/9] Update europa.py

[europarl] Update extractor to support new stream URLs
- Add support for live.media.eup.glcloud.eu direct HLS streams
- Add live stream detection and handling without timestamps
- Prioritise channel-07-bxl which is commonly used
---
 yt_dlp/extractor/europa.py | 357 ++++++++++++++++++++++++++++++-------
 1 file changed, 291 insertions(+), 66 deletions(-)
diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index aa8baf2f78..7470305e95 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -9,7 +9,11 @@ from ..utils import (
     traverse_obj,
     unified_strdate,
     xpath_text,
+    ExtractorError,
 )
+import re
+import datetime
+import time
 
 
 class EuropaIE(InfoExtractor):
@@ -94,97 +98,318 @@ class EuropaIE(InfoExtractor):
 
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        https?://multimedia\.europarl\.europa\.eu/
-        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
+        https?://(?:
+            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
+            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?:channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w-]+)(?:\.m3u8|/master\.m3u8)
+        )
     '''
     _TESTS = [{
         'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
         'info_dict': {
-            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
+            'id': '20220914-0900-PLENARY',
             'display_id': '20220914-0900-PLENARY',
             'ext': 'mp4',
             'title': 'Plenary session',
-            'release_timestamp': 1663139069,
-            'release_date': '20220914',
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        # live webstream
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
+        # New URL format for direct HLS streams
+        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870',
         'info_dict': {
+            'id': 'index-archive',
             'ext': 'mp4',
-            'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
-            'release_timestamp': 1668502800,
-            'title': 'Euroscola 2022-11-15 19:21',
-            'release_date': '20221115',
-            'live_status': 'is_live',
+            'title': 'European Parliament Stream',
         },
-        'skip': 'not live anymore',
-    }, {
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT',
-        'info_dict': {
-            'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7',
-            'display_id': '20230301-1130-COMMITTEE-CULT',
-            'ext': 'mp4',
-            'release_date': '20230301',
-            'title': 'Committee on Culture and Education',
-            'release_timestamp': 1677666641,
+        'params': {
+            'skip_download': True,
         },
     }, {
-        # live stream
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI',
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/special-committee-on-housing-crisis-in-european-union-ordinary-meeting_20250324-1500-COMMITTEE-HOUS',
         'info_dict': {
-            'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9',
+            'id': '20250324-1500-COMMITTEE-HOUS',
+            'display_id': '20250324-1500-COMMITTEE-HOUS',
             'ext': 'mp4',
-            'release_date': '20230524',
-            'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
-            'release_timestamp': 1684911541,
-            'live_status': 'is_live',
+            'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting',
+            'is_live': False,
         },
-        'skip': 'Not live anymore',
-    }, {
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER',
-        'info_dict': {
-            'id': 'c1f11567-5b52-470a-f3e1-08dc3c216ace',
-            'display_id': '20240320-1345-SPECIAL-PRESSER',
-            'ext': 'mp4',
-            'release_date': '20240320',
-            'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234',
-            'release_timestamp': 1710939767,
+        'params': {
+            'skip_download': True,
         },
-    }, {
-        'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER',
-        'only_matching': True,
     }]
 
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+    # Known working stream IDs (in order of likely success)
+    _ARCHIVE_STREAM_IDS = [
+        "index-archive",
+        "norsk-archive",
+    ]
+    
+    # Live stream IDs
+    _LIVE_STREAM_IDS = [
+        "index",
+        "master",
+        "playlist",
+        "norsk",
+    ]
 
-        webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps']
+    # Known CDN endpoints (in order of likely success)
+    _ENDPOINTS = [
+        "2113753",  # This appears to be the main endpoint
+        "2113749",
+        "2113750",
+        "2113751",
+        "2113752",
+        "2113754",
+    ]
 
-        json_info = self._download_json(
-            'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id,
-            query={
-                'api-version': 1.0,
-                'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968',
-                'externalReference': display_id,
-            })
-
-        formats, subtitles = [], {}
-        for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')):
-            fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id)
-            formats.extend(fmt)
-            self._merge_subtitles(subs, target=subtitles)
+    # Prioritized channel list based on observations (channel-07-bxl is often used)
+    _CHANNELS = [
+        "channel-07-bxl",  # Most common based on examples
+        "channel-03-bxl",  # Also seen in examples
+        "channel-01-bxl",
+        "channel-02-bxl",
+        "channel-04-bxl",
+        "channel-05-bxl",
+        "channel-06-bxl",
+        "channel-08-bxl",
+        "channel-09-bxl",
+        "channel-10-bxl",
+    ]
 
+    def _parse_meeting_id(self, display_id):
+        """Extract date and time information from the meeting ID."""
+        date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
+        if date_match:
+            date_str, time_str, meeting_type = date_match.groups()
+            try:
+                # Parse the date and time
+                year = int(date_str[:4])
+                month = int(date_str[4:6])
+                day = int(date_str[6:8])
+                hour = int(time_str[:2])
+                minute = int(time_str[2:4])
+                
+                # Create datetime object
+                meeting_dt = datetime.datetime(year, month, day, hour, minute)
+                
+                # Calculate a reasonable meeting duration (2 hours by default)
+                end_dt = meeting_dt + datetime.timedelta(hours=2)
+                
+                # Check if meeting is today or in the future (potential live stream)
+                now = datetime.datetime.now()
+                is_today = (meeting_dt.year == now.year and 
+                           meeting_dt.month == now.month and 
+                           meeting_dt.day == now.day)
+                is_future = meeting_dt > now
+                is_recent_past = now - meeting_dt < datetime.timedelta(hours=6)
+                
+                return {
+                    'date': date_str,
+                    'time': time_str,
+                    'type': meeting_type,
+                    'start_dt': meeting_dt,
+                    'end_dt': end_dt,
+                    'start_timestamp': int(meeting_dt.timestamp()),
+                    'end_timestamp': int(end_dt.timestamp()),
+                    'is_today': is_today,
+                    'is_future': is_future,
+                    'is_recent_past': is_recent_past,
+                    'is_live_candidate': is_today or is_future or is_recent_past,
+                }
+            except (ValueError, OverflowError) as e:
+                self.report_warning(f"Failed to parse meeting date/time: {e}")
+        
+        # If we can't parse the date/time, use the current time minus 24 hours to now
+        current_time = int(time.time())
         return {
-            'id': json_info['id'],
-            'display_id': display_id,
-            'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False),
-            'formats': formats,
-            'subtitles': subtitles,
-            'release_timestamp': parse_iso8601(json_info.get('startDateTime')),
-            'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live',
+            'start_timestamp': current_time - 86400,  # 24 hours ago
+            'end_timestamp': current_time,
+            'is_live_candidate': True,  # Assume it might be live if we can't parse the time
         }
+
+    def _find_m3u8_in_webpage(self, webpage):
+        """Look for m3u8 URLs directly in the webpage."""
+        m3u8_matches = re.findall(
+            r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])',
+            webpage
+        )
+        if m3u8_matches:
+            return [url[0].replace('\\/', '/').replace('\\\\', '\\') for url in m3u8_matches]
+        
+        return []
+
+    def _extract_title_from_webpage(self, webpage):
+        """Extract the title from the webpage."""
+        title = self._html_search_regex(
+            r'<meta property="og:title" content="([^"]+)"',
+            webpage, 'title', default=None) or \
+            self._html_search_regex(
+            r'<title>([^<]+)</title>',
+            webpage, 'title', default='European Parliament Stream')
+        
+        # Clean up title
+        title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
+        return title
+
+    def _check_is_live(self, webpage):
+        """Check if the stream is likely to be live based on webpage content."""
+        live_indicators = [
+            r'(?i)live\s+now',
+            r'(?i)streaming\s+live',
+            r'(?i)watch\s+live',
+            r'(?i)live\s+stream',
+            r'(?i)currently\s+live',
+            r'(?i)livestream',
+            r'isLive\s*[:=]\s*true',
+            r'"isLive"\s*:\s*true',
+            r'data-is-live\s*=\s*["\'](true|1)["\']',
+        ]
+        
+        for indicator in live_indicators:
+            if re.search(indicator, webpage):
+                return True
+        
+        return False
+
+    def _try_url(self, url, display_id):
+        """Try a single URL and return formats and subtitles if successful."""
+        try:
+            self.to_screen(f"Trying URL: {url}")
+            fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+            
+            if fmt:
+                return fmt, subs
+        except ExtractorError as e:
+            self.report_warning(f"Failed with URL {url}: {e}")
+        
+        return None, None
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        display_id = mobj.group('id')
+        live_id = mobj.group('live_id')
+        stream_id = mobj.group('stream_id')
+
+        # Handle direct HLS stream URLs
+        if live_id and stream_id:
+            # Strip any query parameters from stream_id
+            if '?' in stream_id:
+                stream_id = stream_id.split('?')[0]
+            
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                url, stream_id, 'mp4', m3u8_id='hls', fatal=False)
+
+            return {
+                'id': stream_id,
+                'title': 'European Parliament Stream',
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        # If we're dealing with a europarl.europa.eu URL, download the webpage first
+        webpage = self._download_webpage(url, display_id)
+        title = self._extract_title_from_webpage(webpage)
+        
+        # Check if this is likely to be a live stream
+        is_live_page = self._check_is_live(webpage)
+        
+        # First, look for m3u8 URLs directly in the page
+        direct_urls = self._find_m3u8_in_webpage(webpage)
+        if direct_urls:
+            self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage")
+            for m3u8_url in direct_urls:
+                formats, subtitles = self._try_url(m3u8_url, display_id)
+                if formats:
+                    return {
+                        'id': display_id,
+                        'display_id': display_id,
+                        'title': title,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                        'is_live': is_live_page,
+                    }
+        
+        # Parse the meeting ID and check if this is potentially a live stream
+        meeting_info = self._parse_meeting_id(display_id)
+        start_timestamp = meeting_info.get('start_timestamp')
+        end_timestamp = meeting_info.get('end_timestamp')
+        is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page
+        
+        self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}")
+        self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}")
+        
+        # First check for live streams if this is a live candidate
+        if is_live_candidate:
+            self.to_screen("Checking for live stream URLs first")
+            
+            for endpoint in self._ENDPOINTS[:2]:  # Only try the first two endpoints for live
+                for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for live
+                    for stream_type in self._LIVE_STREAM_IDS:
+                        # For live streams, try URLs without timestamps
+                        live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8"
+                        formats, subtitles = self._try_url(live_url, display_id)
+                        
+                        if formats:
+                            return {
+                                'id': display_id,
+                                'display_id': display_id,
+                                'title': title,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                                'is_live': True,
+                            }
+        
+        # Try archived streams with prioritized channels
+        for channel in self._CHANNELS:
+            for stream_type in self._ARCHIVE_STREAM_IDS:
+                # For archived content, include timestamps
+                archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                formats, subtitles = self._try_url(archive_url, display_id)
+                
+                if formats:
+                    return {
+                        'id': display_id,
+                        'display_id': display_id,
+                        'title': title,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                        'is_live': False,
+                    }
+        
+        # If main endpoint + prioritized channels didn't work, try other endpoints
+        for endpoint in self._ENDPOINTS[1:]:
+            for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
+                for stream_type in self._ARCHIVE_STREAM_IDS:
+                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                    formats, subtitles = self._try_url(archive_url, display_id)
+                    
+                    if formats:
+                        return {
+                            'id': display_id,
+                            'display_id': display_id,
+                            'title': title,
+                            'formats': formats,
+                            'subtitles': subtitles,
+                            'is_live': False,
+                        }
+        
+        # If we've reached here, we need to give a helpful error message
+        parsed_date = f"{meeting_info.get('date', 'unknown-date')}"
+        parsed_time = f"{meeting_info.get('time', 'unknown-time')}"
+        
+        # Provide different suggestions based on whether it's likely live or archived
+        if is_live_candidate:
+            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8"
+            suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\""
+        else:
+            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+            suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\""
+        
+        raise ExtractorError(
+            f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n"
+            f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n"
+            f"{suggestion_text}",
+            expected=True
+        )

From 2afe7e2fa3b21ab6f6879a05a06043eedbc864eb Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Tue, 25 Mar 2025 16:02:51 +0000
Subject: [PATCH 2/9] Update europa.py

This makes extraction work for archived videos, but not yet for live streams.
---
 yt_dlp/extractor/europa.py | 205 ++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 129 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index 7470305e95..c3a03bf591 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -10,10 +10,13 @@ from ..utils import (
     unified_strdate,
     xpath_text,
     ExtractorError,
+    js_to_json,
+    urljoin
 )
 import re
-import datetime
+import json
 import time
+import datetime
 
 
 class EuropaIE(InfoExtractor):
@@ -132,7 +135,6 @@ class EuroParlWebstreamIE(InfoExtractor):
             'display_id': '20250324-1500-COMMITTEE-HOUS',
             'ext': 'mp4',
             'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting',
-            'is_live': False,
         },
         'params': {
             'skip_download': True,
@@ -140,21 +142,13 @@ class EuroParlWebstreamIE(InfoExtractor):
     }]
 
     # Known working stream IDs (in order of likely success)
-    _ARCHIVE_STREAM_IDS = [
+    KNOWN_STREAM_IDS = [
         "index-archive",
         "norsk-archive",
     ]
-    
-    # Live stream IDs
-    _LIVE_STREAM_IDS = [
-        "index",
-        "master",
-        "playlist",
-        "norsk",
-    ]
 
     # Known CDN endpoints (in order of likely success)
-    _ENDPOINTS = [
+    KNOWN_ENDPOINTS = [
         "2113753",  # This appears to be the main endpoint
         "2113749",
         "2113750",
@@ -164,7 +158,7 @@ class EuroParlWebstreamIE(InfoExtractor):
     ]
 
     # Prioritized channel list based on observations (channel-07-bxl is often used)
-    _CHANNELS = [
+    PRIORITIZED_CHANNELS = [
         "channel-07-bxl",  # Most common based on examples
         "channel-03-bxl",  # Also seen in examples
         "channel-01-bxl",
@@ -179,6 +173,7 @@ class EuroParlWebstreamIE(InfoExtractor):
 
     def _parse_meeting_id(self, display_id):
         """Extract date and time information from the meeting ID."""
+        # Format: YYYYMMDD-HHMM-COMMITTEE-TYPE
         date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
         if date_match:
             date_str, time_str, meeting_type = date_match.groups()
@@ -196,14 +191,6 @@ class EuroParlWebstreamIE(InfoExtractor):
                 # Calculate a reasonable meeting duration (2 hours by default)
                 end_dt = meeting_dt + datetime.timedelta(hours=2)
                 
-                # Check if meeting is today or in the future (potential live stream)
-                now = datetime.datetime.now()
-                is_today = (meeting_dt.year == now.year and 
-                           meeting_dt.month == now.month and 
-                           meeting_dt.day == now.day)
-                is_future = meeting_dt > now
-                is_recent_past = now - meeting_dt < datetime.timedelta(hours=6)
-                
                 return {
                     'date': date_str,
                     'time': time_str,
@@ -212,10 +199,6 @@ class EuroParlWebstreamIE(InfoExtractor):
                     'end_dt': end_dt,
                     'start_timestamp': int(meeting_dt.timestamp()),
                     'end_timestamp': int(end_dt.timestamp()),
-                    'is_today': is_today,
-                    'is_future': is_future,
-                    'is_recent_past': is_recent_past,
-                    'is_live_candidate': is_today or is_future or is_recent_past,
                 }
             except (ValueError, OverflowError) as e:
                 self.report_warning(f"Failed to parse meeting date/time: {e}")
@@ -225,11 +208,11 @@ class EuroParlWebstreamIE(InfoExtractor):
         return {
             'start_timestamp': current_time - 86400,  # 24 hours ago
             'end_timestamp': current_time,
-            'is_live_candidate': True,  # Assume it might be live if we can't parse the time
         }
 
     def _find_m3u8_in_webpage(self, webpage):
         """Look for m3u8 URLs directly in the webpage."""
+        # Look for direct m3u8 URLs with timestamps
         m3u8_matches = re.findall(
             r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])',
             webpage
@@ -252,40 +235,6 @@ class EuroParlWebstreamIE(InfoExtractor):
         title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
         return title
 
-    def _check_is_live(self, webpage):
-        """Check if the stream is likely to be live based on webpage content."""
-        live_indicators = [
-            r'(?i)live\s+now',
-            r'(?i)streaming\s+live',
-            r'(?i)watch\s+live',
-            r'(?i)live\s+stream',
-            r'(?i)currently\s+live',
-            r'(?i)livestream',
-            r'isLive\s*[:=]\s*true',
-            r'"isLive"\s*:\s*true',
-            r'data-is-live\s*=\s*["\'](true|1)["\']',
-        ]
-        
-        for indicator in live_indicators:
-            if re.search(indicator, webpage):
-                return True
-        
-        return False
-
-    def _try_url(self, url, display_id):
-        """Try a single URL and return formats and subtitles if successful."""
-        try:
-            self.to_screen(f"Trying URL: {url}")
-            fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-            
-            if fmt:
-                return fmt, subs
-        except ExtractorError as e:
-            self.report_warning(f"Failed with URL {url}: {e}")
-        
-        return None, None
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         display_id = mobj.group('id')
@@ -312,104 +261,102 @@ class EuroParlWebstreamIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
         title = self._extract_title_from_webpage(webpage)
         
-        # Check if this is likely to be a live stream
-        is_live_page = self._check_is_live(webpage)
-        
         # First, look for m3u8 URLs directly in the page
         direct_urls = self._find_m3u8_in_webpage(webpage)
         if direct_urls:
             self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage")
             for m3u8_url in direct_urls:
-                formats, subtitles = self._try_url(m3u8_url, display_id)
-                if formats:
-                    return {
-                        'id': display_id,
-                        'display_id': display_id,
-                        'title': title,
-                        'formats': formats,
-                        'subtitles': subtitles,
-                        'is_live': is_live_page,
-                    }
+                try:
+                    self.to_screen(f"Trying direct URL: {m3u8_url}")
+                    formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                    
+                    if formats:
+                        return {
+                            'id': display_id,
+                            'display_id': display_id,
+                            'title': title,
+                            'formats': formats,
+                            'subtitles': subtitles,
+                        }
+                except ExtractorError as e:
+                    self.report_warning(f"Failed with direct URL {m3u8_url}: {e}")
         
-        # Parse the meeting ID and check if this is potentially a live stream
+        # If no direct URLs found, parse the meeting ID and generate likely timestamps
         meeting_info = self._parse_meeting_id(display_id)
         start_timestamp = meeting_info.get('start_timestamp')
         end_timestamp = meeting_info.get('end_timestamp')
-        is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page
         
         self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}")
-        self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}")
         
-        # First check for live streams if this is a live candidate
-        if is_live_candidate:
-            self.to_screen("Checking for live stream URLs first")
-            
-            for endpoint in self._ENDPOINTS[:2]:  # Only try the first two endpoints for live
-                for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for live
-                    for stream_type in self._LIVE_STREAM_IDS:
-                        # For live streams, try URLs without timestamps
-                        live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8"
-                        formats, subtitles = self._try_url(live_url, display_id)
-                        
-                        if formats:
-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': True,
-                            }
+        # Try a variety of possibilities, starting with the most likely combinations
+        formats = []
+        subtitles = {}
+        working_url = None
         
-        # Try archived streams with prioritized channels
-        for channel in self._CHANNELS:
-            for stream_type in self._ARCHIVE_STREAM_IDS:
-                # For archived content, include timestamps
-                archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                formats, subtitles = self._try_url(archive_url, display_id)
+        # Main endpoint with prioritized channels
+        for channel in self.PRIORITIZED_CHANNELS:
+            for stream_type in self.KNOWN_STREAM_IDS:
+                candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                self.to_screen(f"Trying URL: {candidate_url}")
                 
-                if formats:
-                    return {
-                        'id': display_id,
-                        'display_id': display_id,
-                        'title': title,
-                        'formats': formats,
-                        'subtitles': subtitles,
-                        'is_live': False,
-                    }
-        
-        # If main endpoint + prioritized channels didn't work, try other endpoints
-        for endpoint in self._ENDPOINTS[1:]:
-            for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
-                for stream_type in self._ARCHIVE_STREAM_IDS:
-                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                    formats, subtitles = self._try_url(archive_url, display_id)
+                try:
+                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                        candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
                     
-                    if formats:
+                    if fmt:
+                        formats.extend(fmt)
+                        self._merge_subtitles(subs, target=subtitles)
+                        working_url = candidate_url
+                        self.to_screen(f"Success! Found working URL: {working_url}")
+                        
                         return {
                             'id': display_id,
                             'display_id': display_id,
                             'title': title,
                             'formats': formats,
                             'subtitles': subtitles,
-                            'is_live': False,
                         }
+                except ExtractorError as e:
+                    self.report_warning(f"Failed with URL {candidate_url}: {e}")
+        
+        # If main endpoint + prioritized channels didn't work, try other endpoints
+        for endpoint in self.KNOWN_ENDPOINTS[1:]:  # Skip the first one as we already tried it
+            for channel in self.PRIORITIZED_CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
+                for stream_type in self.KNOWN_STREAM_IDS:
+                    candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                    self.to_screen(f"Trying URL: {candidate_url}")
+                    
+                    try:
+                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                            candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                        
+                        if fmt:
+                            formats.extend(fmt)
+                            self._merge_subtitles(subs, target=subtitles)
+                            working_url = candidate_url
+                            self.to_screen(f"Success! Found working URL: {working_url}")
+                            
+                            return {
+                                'id': display_id,
+                                'display_id': display_id,
+                                'title': title,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                            }
+                    except ExtractorError as e:
+                        self.report_warning(f"Failed with URL {candidate_url}: {e}")
         
         # If we've reached here, we need to give a helpful error message
         parsed_date = f"{meeting_info.get('date', 'unknown-date')}"
         parsed_time = f"{meeting_info.get('time', 'unknown-time')}"
         
-        # Provide different suggestions based on whether it's likely live or archived
-        if is_live_candidate:
-            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8"
-            suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\""
-        else:
-            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-            suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\""
+        # Provide the most likely URL for manual use
+        suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
         
         raise ExtractorError(
             f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n"
-            f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n"
-            f"{suggestion_text}",
+            f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n"
+            f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"",
             expected=True
         )

From 43ba015d276edaa80168a05b247b2e33e2954302 Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Wed, 26 Mar 2025 15:54:51 +0000
Subject: [PATCH 3/9] Update europa.py

---
 yt_dlp/extractor/europa.py | 310 ++++++++++++++++++-------------------
 1 file changed, 155 insertions(+), 155 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index c3a03bf591..8dd0ddcd88 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -103,7 +103,7 @@ class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://(?:
             multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?:channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w-]+)(?:\.m3u8|/master\.m3u8)
+            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w-]+)(?:\.m3u8|/master\.m3u8)
         )
     '''
     _TESTS = [{
@@ -118,7 +118,7 @@ class EuroParlWebstreamIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        # New URL format for direct HLS streams
+        # Direct HLS stream URL
         'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870',
         'info_dict': {
             'id': 'index-archive',
@@ -128,187 +128,144 @@ class EuroParlWebstreamIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-    }, {
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/special-committee-on-housing-crisis-in-european-union-ordinary-meeting_20250324-1500-COMMITTEE-HOUS',
-        'info_dict': {
-            'id': '20250324-1500-COMMITTEE-HOUS',
-            'display_id': '20250324-1500-COMMITTEE-HOUS',
-            'ext': 'mp4',
-            'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting',
-        },
-        'params': {
-            'skip_download': True,
-        },
     }]
 
-    # Known working stream IDs (in order of likely success)
-    KNOWN_STREAM_IDS = [
-        "index-archive",
-        "norsk-archive",
-    ]
+    # Main CDN endpoint - primarily target this instead of trying multiple
+    MAIN_ENDPOINT = "2113753"
+    
+    # Priority channels based on observed success rates
+    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"]
+    
+    # Default stream types by content type
+    LIVE_STREAM_TYPES = ["index", "master", "playlist"]
+    ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"]
 
-    # Known CDN endpoints (in order of likely success)
-    KNOWN_ENDPOINTS = [
-        "2113753",  # This appears to be the main endpoint
-        "2113749",
-        "2113750",
-        "2113751",
-        "2113752",
-        "2113754",
-    ]
+    def _extract_direct_url_from_webpage(self, webpage):
+        """Extract direct m3u8 URLs from webpage with minimal logging"""
+        m3u8_urls = []
+        
+        # Search patterns for m3u8 URLs
+        for pattern in [
+            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']',
+            r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
+            r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
+        ]:
+            matches = re.findall(pattern, webpage)
+            if matches:
+                m3u8_urls.extend(matches)
+        
+        # Clean up URLs
+        clean_urls = []
+        for url in m3u8_urls:
+            # Remove any JS string escaping
+            url = url.replace('\\/', '/').replace('\\\\', '\\')
+            clean_urls.append(url)
+            
+        # Extract from network panel if available
+        network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE)
+        if network_url_match:
+            clean_urls.append(network_url_match.group(1))
+            
+        return clean_urls
 
-    # Prioritized channel list based on observations (channel-07-bxl is often used)
-    PRIORITIZED_CHANNELS = [
-        "channel-07-bxl",  # Most common based on examples
-        "channel-03-bxl",  # Also seen in examples
-        "channel-01-bxl",
-        "channel-02-bxl",
-        "channel-04-bxl",
-        "channel-05-bxl",
-        "channel-06-bxl",
-        "channel-08-bxl",
-        "channel-09-bxl",
-        "channel-10-bxl",
-    ]
+    def _extract_title_from_webpage(self, webpage, display_id):
+        """Extract the title from the webpage"""
+        # Try different patterns to extract the title
+        for pattern in [
+            r'<meta property="og:title" content="([^"]+)"',
+            r'<title>([^<]+)</title>',
+            r'<h1[^>]*>([^<]+)</h1>',
+            r'"title"\s*:\s*"([^"]+)"',
+        ]:
+            title_match = re.search(pattern, webpage)
+            if title_match:
+                title = title_match.group(1).strip()
+                # Clean up common suffixes
+                title = re.sub(r'\s*\|\s*European Parliament$', '', title)
+                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title)
+                return title
+                
+        return f"European Parliament Session - {display_id}"
 
-    def _parse_meeting_id(self, display_id):
-        """Extract date and time information from the meeting ID."""
-        # Format: YYYYMMDD-HHMM-COMMITTEE-TYPE
+    def _parse_meeting_date(self, display_id):
+        """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)"""
         date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
         if date_match:
             date_str, time_str, meeting_type = date_match.groups()
             try:
-                # Parse the date and time
+                # Parse the date components
                 year = int(date_str[:4])
                 month = int(date_str[4:6])
                 day = int(date_str[6:8])
                 hour = int(time_str[:2])
                 minute = int(time_str[2:4])
                 
-                # Create datetime object
+                # Create timestamps with a generous window (3 hours before and after)
                 meeting_dt = datetime.datetime(year, month, day, hour, minute)
+                start_dt = meeting_dt - datetime.timedelta(hours=3)
+                end_dt = meeting_dt + datetime.timedelta(hours=6)
                 
-                # Calculate a reasonable meeting duration (2 hours by default)
-                end_dt = meeting_dt + datetime.timedelta(hours=2)
+                # Convert to timestamps
+                start_ts = int(start_dt.timestamp())
+                end_ts = int(end_dt.timestamp())
                 
-                return {
-                    'date': date_str,
-                    'time': time_str,
-                    'type': meeting_type,
-                    'start_dt': meeting_dt,
-                    'end_dt': end_dt,
-                    'start_timestamp': int(meeting_dt.timestamp()),
-                    'end_timestamp': int(end_dt.timestamp()),
-                }
-            except (ValueError, OverflowError) as e:
-                self.report_warning(f"Failed to parse meeting date/time: {e}")
-        
-        # If we can't parse the date/time, use the current time minus 24 hours to now
-        current_time = int(time.time())
-        return {
-            'start_timestamp': current_time - 86400,  # 24 hours ago
-            'end_timestamp': current_time,
-        }
-
-    def _find_m3u8_in_webpage(self, webpage):
-        """Look for m3u8 URLs directly in the webpage."""
-        # Look for direct m3u8 URLs with timestamps
-        m3u8_matches = re.findall(
-            r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])',
-            webpage
-        )
-        if m3u8_matches:
-            return [url[0].replace('\\/', '/').replace('\\\\', '\\') for url in m3u8_matches]
-        
-        return []
-
-    def _extract_title_from_webpage(self, webpage):
-        """Extract the title from the webpage."""
-        title = self._html_search_regex(
-            r'<meta property="og:title" content="([^"]+)"',
-            webpage, 'title', default=None) or \
-            self._html_search_regex(
-            r'<title>([^<]+)</title>',
-            webpage, 'title', default='European Parliament Stream')
+                return start_ts, end_ts
+                
+            except (ValueError, OverflowError):
+                pass
         
-        # Clean up title
-        title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
-        return title
+        # Fallback to a recent 48-hour window
+        now = int(time.time())
+        start_time = now - (48 * 3600)  # 48 hours ago
+        return start_time, now
 
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         display_id = mobj.group('id')
         live_id = mobj.group('live_id')
         stream_id = mobj.group('stream_id')
+        channel = mobj.group('channel')
 
-        # Handle direct HLS stream URLs
+        # Handle direct HLS URLs
         if live_id and stream_id:
-            # Strip any query parameters from stream_id
-            if '?' in stream_id:
-                stream_id = stream_id.split('?')[0]
+            # Remove query parameters from stream_id if present
+            clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id
             
             formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                url, stream_id, 'mp4', m3u8_id='hls', fatal=False)
-
+                url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True)
+            
             return {
-                'id': stream_id,
+                'id': clean_stream_id,
                 'title': 'European Parliament Stream',
                 'formats': formats,
                 'subtitles': subtitles,
             }
 
-        # If we're dealing with a europarl.europa.eu URL, download the webpage first
+        # Download the webpage for standard europarl URLs
         webpage = self._download_webpage(url, display_id)
-        title = self._extract_title_from_webpage(webpage)
         
-        # First, look for m3u8 URLs directly in the page
-        direct_urls = self._find_m3u8_in_webpage(webpage)
-        if direct_urls:
-            self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage")
-            for m3u8_url in direct_urls:
-                try:
-                    self.to_screen(f"Trying direct URL: {m3u8_url}")
-                    formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-                    
-                    if formats:
-                        return {
-                            'id': display_id,
-                            'display_id': display_id,
-                            'title': title,
-                            'formats': formats,
-                            'subtitles': subtitles,
-                        }
-                except ExtractorError as e:
-                    self.report_warning(f"Failed with direct URL {m3u8_url}: {e}")
+        # Check for live indicators
+        is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE))
         
-        # If no direct URLs found, parse the meeting ID and generate likely timestamps
-        meeting_info = self._parse_meeting_id(display_id)
-        start_timestamp = meeting_info.get('start_timestamp')
-        end_timestamp = meeting_info.get('end_timestamp')
+        # Extract title
+        title = self._extract_title_from_webpage(webpage, display_id)
         
-        self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}")
+        # First try direct URLs from the webpage (this is the most reliable approach)
+        direct_urls = self._extract_direct_url_from_webpage(webpage)
         
-        # Try a variety of possibilities, starting with the most likely combinations
+        # Track whether we successfully found a stream
         formats = []
         subtitles = {}
-        working_url = None
         
-        # Main endpoint with prioritized channels
-        for channel in self.PRIORITIZED_CHANNELS:
-            for stream_type in self.KNOWN_STREAM_IDS:
-                candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                self.to_screen(f"Trying URL: {candidate_url}")
-                
+        if direct_urls:
+            for m3u8_url in direct_urls:
                 try:
                     fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                        candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
                     
                     if fmt:
                         formats.extend(fmt)
                         self._merge_subtitles(subs, target=subtitles)
-                        working_url = candidate_url
-                        self.to_screen(f"Success! Found working URL: {working_url}")
                         
                         return {
                             'id': display_id,
@@ -316,26 +273,58 @@ class EuroParlWebstreamIE(InfoExtractor):
                             'title': title,
                             'formats': formats,
                             'subtitles': subtitles,
+                            'is_live': is_live,
                         }
-                except ExtractorError as e:
-                    self.report_warning(f"Failed with URL {candidate_url}: {e}")
+                except ExtractorError:
+                    pass
+        
+        # Parse timestamps for archive retrieval (or use current time for live)
+        if is_live:
+            # For live streams, we don't need timestamps
+            start_timestamp, end_timestamp = None, None
+        else:
+            start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
         
-        # If main endpoint + prioritized channels didn't work, try other endpoints
-        for endpoint in self.KNOWN_ENDPOINTS[1:]:  # Skip the first one as we already tried it
-            for channel in self.PRIORITIZED_CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
-                for stream_type in self.KNOWN_STREAM_IDS:
-                    candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                    self.to_screen(f"Trying URL: {candidate_url}")
+        # Use appropriate stream types for the content type
+        stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES
+        
+        # Try combinations with improved targeting
+        for channel in self.PRIORITY_CHANNELS:
+            for stream_type in stream_types:
+                # For live streams, try without timestamps first
+                if is_live:
+                    live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8"
+                    
+                    try:
+                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                            live_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                        
+                        if fmt:
+                            formats.extend(fmt)
+                            self._merge_subtitles(subs, target=subtitles)
+                            
+                            return {
+                                'id': display_id,
+                                'display_id': display_id,
+                                'title': title,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                                'is_live': True,
+                            }
+                    except ExtractorError:
+                        pass
+                
+                # For archived content (or as fallback for live), try with timestamps
+                if start_timestamp and end_timestamp:
+                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
                     
                     try:
                         fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                            archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
                         
                         if fmt:
                             formats.extend(fmt)
                             self._merge_subtitles(subs, target=subtitles)
-                            working_url = candidate_url
-                            self.to_screen(f"Success! Found working URL: {working_url}")
                             
                             return {
                                 'id': display_id,
@@ -343,20 +332,31 @@ class EuroParlWebstreamIE(InfoExtractor):
                                 'title': title,
                                 'formats': formats,
                                 'subtitles': subtitles,
+                                'is_live': False,
                             }
-                    except ExtractorError as e:
-                        self.report_warning(f"Failed with URL {candidate_url}: {e}")
+                    except ExtractorError:
+                        pass
+        
+        # Provide helpful error with the most likely working URLs
+        suggested_urls = []
         
-        # If we've reached here, we need to give a helpful error message
-        parsed_date = f"{meeting_info.get('date', 'unknown-date')}"
-        parsed_time = f"{meeting_info.get('time', 'unknown-time')}"
+        # Add the URLs that are most likely to work based on the logs and screenshots
+        if start_timestamp and end_timestamp:
+            suggested_urls.extend([
+                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}",
+                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+            ])
+        else:
+            suggested_urls.extend([
+                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8",
+                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8"
+            ])
         
-        # Provide the most likely URL for manual use
-        suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+        suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls])
         
         raise ExtractorError(
-            f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n"
-            f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n"
-            f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"",
+            f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n"
+            f"Live stream detected: {is_live}\n"
+            f"Try using yt-dlp directly with one of these URLs:\n{suggestions}",
             expected=True
         )

From fe08c6ca27701c8199300f48c9320fd4f20584fb Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Fri, 28 Mar 2025 12:49:15 +0000
Subject: [PATCH 4/9] Update europa.py

---
 yt_dlp/extractor/europa.py | 351 ++++++++++++++++++++++---------------
 1 file changed, 205 insertions(+), 146 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index 8dd0ddcd88..b40d393a79 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -103,7 +104,7 @@ class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://(?:
             multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w-]+)(?:\.m3u8|/master\.m3u8)
+            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>[\w-]+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows
         )
     '''
     _TESTS = [{
@@ -118,10 +119,21 @@ class EuroParlWebstreamIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        # Direct HLS stream URL
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870',
+        # Direct HLS stream URL (archive example similar to user provided)
+        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442',
         'info_dict': {
-            'id': 'index-archive',
+            'id': 'norsk-archive', # ID derived from filename before query
+            'ext': 'mp4',
+            'title': 'European Parliament Stream',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    },{
+        # Direct HLS stream URL (live example)
+        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8',
+        'info_dict': {
+            'id': 'index',
             'ext': 'mp4',
             'title': 'European Parliament Stream',
         },
@@ -130,43 +142,53 @@ class EuroParlWebstreamIE(InfoExtractor):
         },
     }]
 
-    # Main CDN endpoint - primarily target this instead of trying multiple
-    MAIN_ENDPOINT = "2113753"
-    
-    # Priority channels based on observed success rates
-    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"]
-    
-    # Default stream types by content type
-    LIVE_STREAM_TYPES = ["index", "master", "playlist"]
-    ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"]
+    # Known CDN endpoints - try these if direct extraction fails
+    # Added 2113713 and 2113713-b based on user's M3U8
+    ENDPOINTS = ["2113753", "2113713", "2113713-b"]
+
+    # Priority channels based on observed success rates & user M3U8
+    # Added channel-01-stb
+    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"]
+
+    # Default stream types/filenames by content type
+    # These are used in the *fallback* guessing logic.
+    # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed.
+    LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"]
+    ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"]
 
     def _extract_direct_url_from_webpage(self, webpage):
         """Extract direct m3u8 URLs from webpage with minimal logging"""
-        m3u8_urls = []
-        
+        m3u8_urls = set() # Use a set to avoid duplicates
+
         # Search patterns for m3u8 URLs
+        # Added more flexibility for quotes and paths
         for pattern in [
-            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']',
+            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']',
             r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
+            # Look for assignments or attributes
+            r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
+            # Look for URLs within JSON-like structures in script tags
+            r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
+            r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
         ]:
             matches = re.findall(pattern, webpage)
-            if matches:
-                m3u8_urls.extend(matches)
-        
-        # Clean up URLs
-        clean_urls = []
-        for url in m3u8_urls:
-            # Remove any JS string escaping
-            url = url.replace('\\/', '/').replace('\\\\', '\\')
-            clean_urls.append(url)
-            
-        # Extract from network panel if available
+            for match in matches:
+                # Handle potential tuple results from findall if multiple groups exist in regex
+                url_match = match if isinstance(match, str) else match[0]
+                # Basic sanity check
+                if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match:
+                    # Remove any JS string escaping
+                    url_match = url_match.replace('\\/', '/').replace('\\\\', '\\')
+                    m3u8_urls.add(url_match)
+
+        # Extract from network panel if available (less reliable parsing)
         network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE)
         if network_url_match:
-            clean_urls.append(network_url_match.group(1))
-            
-        return clean_urls
+            url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\')
+            m3u8_urls.add(url_match)
+
+        self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage')
+        return list(m3u8_urls)
 
     def _extract_title_from_webpage(self, webpage, display_id):
         """Extract the title from the webpage"""
@@ -174,6 +196,7 @@ class EuroParlWebstreamIE(InfoExtractor):
         for pattern in [
             r'<meta property="og:title" content="([^"]+)"',
             r'<title>([^<]+)</title>',
+            r'<h1[^>]*class="erpl_title-h1"[^>]*>([^<]+)</h1>', # Specific title class
             r'<h1[^>]*>([^<]+)</h1>',
             r'"title"\s*:\s*"([^"]+)"',
         ]:
@@ -181,17 +204,18 @@ class EuroParlWebstreamIE(InfoExtractor):
             if title_match:
                 title = title_match.group(1).strip()
                 # Clean up common suffixes
-                title = re.sub(r'\s*\|\s*European Parliament$', '', title)
-                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title)
-                return title
-                
-        return f"European Parliament Session - {display_id}"
+                title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
+                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip()
+                if title:
+                    return title
+
+        return f"European Parliament Session - {display_id}" # Fallback title
 
     def _parse_meeting_date(self, display_id):
         """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)"""
         date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
         if date_match:
-            date_str, time_str, meeting_type = date_match.groups()
+            date_str, time_str, _ = date_match.groups()
             try:
                 # Parse the date components
                 year = int(date_str[:4])
@@ -199,164 +223,199 @@ class EuroParlWebstreamIE(InfoExtractor):
                 day = int(date_str[6:8])
                 hour = int(time_str[:2])
                 minute = int(time_str[2:4])
-                
-                # Create timestamps with a generous window (3 hours before and after)
-                meeting_dt = datetime.datetime(year, month, day, hour, minute)
+
+                # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after)
+                # This helps catch streams that start slightly early or run long
+                meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC
                 start_dt = meeting_dt - datetime.timedelta(hours=3)
-                end_dt = meeting_dt + datetime.timedelta(hours=6)
-                
-                # Convert to timestamps
+                end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window
+
+                # Convert to Unix timestamps
                 start_ts = int(start_dt.timestamp())
                 end_ts = int(end_dt.timestamp())
-                
+
+                self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}')
                 return start_ts, end_ts
-                
-            except (ValueError, OverflowError):
-                pass
-        
-        # Fallback to a recent 48-hour window
+
+            except (ValueError, OverflowError) as e:
+                 self.to_screen(f'Error parsing date from display_id "{display_id}": {e}')
+                 pass # Fall through to fallback
+
+        # Fallback to a recent window if parsing fails or ID format is different
+        self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.')
         now = int(time.time())
-        start_time = now - (48 * 3600)  # 48 hours ago
-        return start_time, now
+        start_time = now - (24 * 3600)  # 24 hours ago (might be too short for older archives)
+        end_time = now + (1 * 3600)      # 1 hour in the future (for live/recent)
+        return start_time, end_time
 
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
+        # Get potential IDs from the regex match groups
         display_id = mobj.group('id')
         live_id = mobj.group('live_id')
         stream_id = mobj.group('stream_id')
         channel = mobj.group('channel')
 
-        # Handle direct HLS URLs
-        if live_id and stream_id:
-            # Remove query parameters from stream_id if present
-            clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id
-            
+        # Use the most specific ID available
+        video_id = display_id or stream_id or live_id or channel
+
+        # Handle direct HLS URLs first (most reliable if provided)
+        if live_id and (stream_id or channel):
+            # Clean up stream_id (remove query parameters for use as info dict id)
+            clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id
+            # If stream_id is missing but channel exists, use channel as part of the id
+            final_id = clean_stream_id or channel or 'unknown_stream'
+            # Remove potential .m3u8 suffix for cleaner ID
+            if final_id.endswith('.m3u8'):
+                 final_id = final_id[:-5]
+
+            self.to_screen(f'Processing direct HLS URL: {url}')
             formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True)
-            
+                url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues
+
+            if not formats:
+                 self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}')
+                 # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work
+                 # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True)
+
             return {
-                'id': clean_stream_id,
-                'title': 'European Parliament Stream',
-                'formats': formats,
-                'subtitles': subtitles,
+                'id': final_id,
+                'title': 'European Parliament Stream', # Generic title for direct URLs
+                'formats': formats or [],
+                'subtitles': subtitles or {},
+                'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL
             }
 
-        # Download the webpage for standard europarl URLs
+        # --- Fallback for multimedia.europarl.europa.eu URLs ---
+        if not display_id: # Should have display_id if it's not a direct HLS URL
+             raise ExtractorError('Failed to identify video ID from URL.')
+
+        self.to_screen(f'Processing webpage URL: {url}')
         webpage = self._download_webpage(url, display_id)
-        
-        # Check for live indicators
-        is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE))
-        
+
+        # Check for live indicators more reliably
+        # Look for common live indicators in JS, classes, or text
+        is_live = bool(re.search(
+            r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)',
+            webpage,
+            re.IGNORECASE))
+        self.to_screen(f'Detected as live: {is_live}')
+
         # Extract title
         title = self._extract_title_from_webpage(webpage, display_id)
-        
-        # First try direct URLs from the webpage (this is the most reliable approach)
+
+        # *** Strategy 1: Extract direct URLs from webpage (Preferred) ***
         direct_urls = self._extract_direct_url_from_webpage(webpage)
-        
-        # Track whether we successfully found a stream
         formats = []
         subtitles = {}
-        
+
         if direct_urls:
+            self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...')
             for m3u8_url in direct_urls:
+                # Clean stream ID from URL for format identification
+                m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0]
+                if m3u8_stream_id.endswith('.m3u8'):
+                    m3u8_stream_id = m3u8_stream_id[:-5]
+
                 try:
                     fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-                    
+                        m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error
+
                     if fmt:
+                        self.to_screen(f'Successfully extracted formats from: {m3u8_url}')
                         formats.extend(fmt)
                         self._merge_subtitles(subs, target=subtitles)
-                        
+                        # If we found formats, we are likely done, return immediately
                         return {
                             'id': display_id,
                             'display_id': display_id,
                             'title': title,
                             'formats': formats,
                             'subtitles': subtitles,
-                            'is_live': is_live,
+                            'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL
                         }
-                except ExtractorError:
-                    pass
-        
-        # Parse timestamps for archive retrieval (or use current time for live)
-        if is_live:
-            # For live streams, we don't need timestamps
-            start_timestamp, end_timestamp = None, None
+                    else:
+                        self.to_screen(f'No formats found in: {m3u8_url}')
+                except ExtractorError as e:
+                    self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}')
+                    pass # Try the next direct URL
         else:
-            start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
-        
-        # Use appropriate stream types for the content type
-        stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES
-        
-        # Try combinations with improved targeting
-        for channel in self.PRIORITY_CHANNELS:
-            for stream_type in stream_types:
-                # For live streams, try without timestamps first
-                if is_live:
-                    live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8"
-                    
-                    try:
-                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            live_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-                        
-                        if fmt:
-                            formats.extend(fmt)
-                            self._merge_subtitles(subs, target=subtitles)
-                            
-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': True,
-                            }
-                    except ExtractorError:
-                        pass
-                
-                # For archived content (or as fallback for live), try with timestamps
-                if start_timestamp and end_timestamp:
-                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                    
+            self.to_screen('No direct M3U8 URLs found in webpage.')
+
+
+        # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) ***
+        self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...')
+
+        # Parse timestamps for archive retrieval (or use a window for live/unknown)
+        # Always parse, even if live, as it might be a recently finished live event
+        start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
+
+        # Use appropriate stream filenames for the content type
+        stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES
+
+        # Try combinations with updated endpoints and channels
+        for endpoint in self.ENDPOINTS:
+            for channel_to_try in self.PRIORITY_CHANNELS:
+                for filename in stream_filenames:
+                    base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}"
+
+                    # Determine if timestamps should be added
+                    # Add timestamps if it's explicitly not live, OR if the filename suggests archive,
+                    # OR if start/end timestamps were successfully parsed from the ID.
+                    # Avoid timestamps for clearly live filenames unless forced by non-live status.
+                    use_timestamps = (
+                        (not is_live or 'archive' in filename.lower())
+                        and start_timestamp and end_timestamp
+                    )
+
+                    test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url
+
                     try:
+                        self.to_screen(f'Trying guessed URL: {test_url}')
                         fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-                        
+                            test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False)
+
                         if fmt:
+                            self.to_screen(f'Success with guessed URL: {test_url}')
                             formats.extend(fmt)
                             self._merge_subtitles(subs, target=subtitles)
-                            
+                            # Found a working combination
                             return {
                                 'id': display_id,
                                 'display_id': display_id,
                                 'title': title,
                                 'formats': formats,
                                 'subtitles': subtitles,
-                                'is_live': False,
+                                'is_live': not use_timestamps, # If we used timestamps, assume not live
                             }
-                    except ExtractorError:
-                        pass
-        
-        # Provide helpful error with the most likely working URLs
-        suggested_urls = []
-        
-        # Add the URLs that are most likely to work based on the logs and screenshots
+                        else:
+                            self.to_screen(f'No formats found in guessed URL: {test_url}')
+
+                    except ExtractorError as e:
+                        # Log error lightly, as many guesses are expected to fail
+                        self.to_screen(f'Guessed URL failed: {test_url} ({e})')
+                        pass # Continue trying other combinations
+
+        # *** If all strategies fail ***
+        self.to_screen('All extraction strategies failed.')
+
+        # Provide helpful error with suggestions
+        error_message = (
+            f"Could not extract stream URL for {display_id or url}. "
+            "The stream may be old, expired, or use an unsupported format.\n"
+            f"Live status detected: {is_live}\n"
+            "Common issues:\n"
+            "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n"
+            "- The event might not be available via the standard CDN endpoints/channels.\n"
+            "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n"
+            "Example (using parsed times, adjust if needed):\n"
+        )
         if start_timestamp and end_timestamp:
-            suggested_urls.extend([
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}",
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-            ])
+             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+             error_message += f'yt-dlp "{example_url}"'
         else:
-            suggested_urls.extend([
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8",
-                f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8"
-            ])
-        
-        suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls])
-        
-        raise ExtractorError(
-            f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n"
-            f"Live stream detected: {is_live}\n"
-            f"Try using yt-dlp directly with one of these URLs:\n{suggestions}",
-            expected=True
-        )
+             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8"
+             error_message += f'yt-dlp "{example_url}"'
+
+
+        raise ExtractorError(error_message, expected=True)

From 6e3ddbbe4dacd39bd9ee0c13be457c90a37687aa Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Fri, 28 Mar 2025 13:08:14 +0000
Subject: [PATCH 5/9] Update europa.py: drop direct-HLS URL guessing, read metadata from Next.js page data

---
 yt_dlp/extractor/europa.py | 410 ++++++++++---------------------------
 1 file changed, 107 insertions(+), 303 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index b40d393a79..cd4cbf4dfd 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,4 +1,3 @@
-# coding: utf-8
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -10,15 +9,7 @@ from ..utils import (
     traverse_obj,
     unified_strdate,
     xpath_text,
-    ExtractorError,
-    js_to_json,
-    urljoin
 )
-import re
-import json
-import time
-import datetime
-
 
 class EuropaIE(InfoExtractor):
     _WORKING = False
@@ -54,7 +45,10 @@ class EuropaIE(InfoExtractor):
         def get_item(type_, preference):
             items = {}
             for item in playlist.findall(f'./info/{type_}/item'):
-                lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+                lang, label = (
+                    xpath_text(item, 'lg', default=None),
+                    xpath_text(item, 'label', default=None)
+                )
                 if lang and label:
                     items[lang] = label.strip()
             for p in preference:
@@ -63,7 +57,6 @@ class EuropaIE(InfoExtractor):
 
         query = parse_qs(url)
         preferred_lang = query.get('sitelang', ('en', ))[0]
-
         preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
 
         title = get_item('title', preferred_langs) or video_id
@@ -102,320 +95,131 @@ class EuropaIE(InfoExtractor):
 
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        https?://(?:
-            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>[\w-]+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows
-        )
+        https?://multimedia\.europarl\.europa\.eu/
+        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
     '''
     _TESTS = [{
         'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
         'info_dict': {
-            'id': '20220914-0900-PLENARY',
+            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
             'display_id': '20220914-0900-PLENARY',
             'ext': 'mp4',
             'title': 'Plenary session',
+            'release_timestamp': 1663139069,
+            'release_date': '20220914',
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        # Direct HLS stream URL (archive example similar to user provided)
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442',
+        # example of old live webstream
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
         'info_dict': {
-            'id': 'norsk-archive', # ID derived from filename before query
             'ext': 'mp4',
-            'title': 'European Parliament Stream',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    },{
-        # Direct HLS stream URL (live example)
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8',
-        'info_dict': {
-            'id': 'index',
-            'ext': 'mp4',
-            'title': 'European Parliament Stream',
-        },
-        'params': {
-            'skip_download': True,
+            'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
+            'release_timestamp': 1668502800,
+            'title': 'Euroscola 2022-11-15 19:21',
+            'release_date': '20221115',
+            'live_status': 'is_live',
         },
+        'skip': 'not live anymore',
     }]
 
-    # Known CDN endpoints - try these if direct extraction fails
-    # Added 2113713 and 2113713-b based on user's M3U8
-    ENDPOINTS = ["2113753", "2113713", "2113713-b"]
-
-    # Priority channels based on observed success rates & user M3U8
-    # Added channel-01-stb
-    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"]
-
-    # Default stream types/filenames by content type
-    # These are used in the *fallback* guessing logic.
-    # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed.
-    LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"]
-    ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"]
-
-    def _extract_direct_url_from_webpage(self, webpage):
-        """Extract direct m3u8 URLs from webpage with minimal logging"""
-        m3u8_urls = set() # Use a set to avoid duplicates
-
-        # Search patterns for m3u8 URLs
-        # Added more flexibility for quotes and paths
-        for pattern in [
-            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']',
-            r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            # Look for assignments or attributes
-            r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
-            # Look for URLs within JSON-like structures in script tags
-            r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-        ]:
-            matches = re.findall(pattern, webpage)
-            for match in matches:
-                # Handle potential tuple results from findall if multiple groups exist in regex
-                url_match = match if isinstance(match, str) else match[0]
-                # Basic sanity check
-                if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match:
-                    # Remove any JS string escaping
-                    url_match = url_match.replace('\\/', '/').replace('\\\\', '\\')
-                    m3u8_urls.add(url_match)
-
-        # Extract from network panel if available (less reliable parsing)
-        network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE)
-        if network_url_match:
-            url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\')
-            m3u8_urls.add(url_match)
-
-        self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage')
-        return list(m3u8_urls)
-
-    def _extract_title_from_webpage(self, webpage, display_id):
-        """Extract the title from the webpage"""
-        # Try different patterns to extract the title
-        for pattern in [
-            r'<meta property="og:title" content="([^"]+)"',
-            r'<title>([^<]+)</title>',
-            r'<h1[^>]*class="erpl_title-h1"[^>]*>([^<]+)</h1>', # Specific title class
-            r'<h1[^>]*>([^<]+)</h1>',
-            r'"title"\s*:\s*"([^"]+)"',
-        ]:
-            title_match = re.search(pattern, webpage)
-            if title_match:
-                title = title_match.group(1).strip()
-                # Clean up common suffixes
-                title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
-                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip()
-                if title:
-                    return title
-
-        return f"European Parliament Session - {display_id}" # Fallback title
-
-    def _parse_meeting_date(self, display_id):
-        """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)"""
-        date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
-        if date_match:
-            date_str, time_str, _ = date_match.groups()
-            try:
-                # Parse the date components
-                year = int(date_str[:4])
-                month = int(date_str[4:6])
-                day = int(date_str[6:8])
-                hour = int(time_str[:2])
-                minute = int(time_str[2:4])
-
-                # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after)
-                # This helps catch streams that start slightly early or run long
-                meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC
-                start_dt = meeting_dt - datetime.timedelta(hours=3)
-                end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window
-
-                # Convert to Unix timestamps
-                start_ts = int(start_dt.timestamp())
-                end_ts = int(end_dt.timestamp())
-
-                self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}')
-                return start_ts, end_ts
-
-            except (ValueError, OverflowError) as e:
-                 self.to_screen(f'Error parsing date from display_id "{display_id}": {e}')
-                 pass # Fall through to fallback
-
-        # Fallback to a recent window if parsing fails or ID format is different
-        self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.')
-        now = int(time.time())
-        start_time = now - (24 * 3600)  # 24 hours ago (might be too short for older archives)
-        end_time = now + (1 * 3600)      # 1 hour in the future (for live/recent)
-        return start_time, end_time
-
     def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        # Get potential IDs from the regex match groups
-        display_id = mobj.group('id')
-        live_id = mobj.group('live_id')
-        stream_id = mobj.group('stream_id')
-        channel = mobj.group('channel')
-
-        # Use the most specific ID available
-        video_id = display_id or stream_id or live_id or channel
-
-        # Handle direct HLS URLs first (most reliable if provided)
-        if live_id and (stream_id or channel):
-            # Clean up stream_id (remove query parameters for use as info dict id)
-            clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id
-            # If stream_id is missing but channel exists, use channel as part of the id
-            final_id = clean_stream_id or channel or 'unknown_stream'
-            # Remove potential .m3u8 suffix for cleaner ID
-            if final_id.endswith('.m3u8'):
-                 final_id = final_id[:-5]
-
-            self.to_screen(f'Processing direct HLS URL: {url}')
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
 
-            if not formats:
-                 self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}')
-                 # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work
-                 # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True)
+        # Try to parse Next.js data for metadata
+        nextjs = self._search_nextjs_data(webpage, display_id, default={})
+        page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={})
+        media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives?
+
+        title = media_info.get('title') or media_info.get('name') or display_id
+        release_timestamp = None
+        # Existing logic uses startDateTime, might need adjustment for archive start/end
+        if 'startDateTime' in media_info:
+             release_timestamp = parse_iso8601(media_info['startDateTime'])
+
+        # Determine if it's Live or VOD/Archive (might need refinement)
+        # mediaSubType might be 'Live' or 'VOD' or something else
+        is_live = media_info.get('mediaSubType') == 'Live'
+
+        # Search for any .m3u8 link first
+        m3u8_links = self._search_regex(
+            r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
+            webpage, 'm3u8 URL', default=None, group=1, fatal=False
+        )
 
+        # --- Potential modification area START ---
+        # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL,
+        # try constructing the index-archive.m3u8 URL here.
+        # Example (conceptual - requires actual start/end times and base URL logic):
+        # if not is_live and media_info.get('startTime') and media_info.get('endTime'):
+        #     start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps
+        #     end_time = media_info['endTime']
+        #     # Assuming m3u8_links contains a base URL that needs modification
+        #     base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction
+        #     archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}'
+        #     m3u8_links = archive_url # Replace the found link with the constructed one
+        # --- Potential modification area END ---
+
+
+        if not m3u8_links:
+            self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.')
+            # Return basic info if no HLS manifest found
             return {
-                'id': final_id,
-                'title': 'European Parliament Stream', # Generic title for direct URLs
-                'formats': formats or [],
-                'subtitles': subtitles or {},
-                'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL
+                'id': media_info.get('id') or display_id,
+                'display_id': display_id,
+                'title': title,
+                'release_timestamp': release_timestamp,
+                'formats': [],
             }
 
-        # --- Fallback for multimedia.europarl.europa.eu URLs ---
-        if not display_id: # Should have display_id if it's not a direct HLS URL
-             raise ExtractorError('Failed to identify video ID from URL.')
-
-        self.to_screen(f'Processing webpage URL: {url}')
-        webpage = self._download_webpage(url, display_id)
-
-        # Check for live indicators more reliably
-        # Look for common live indicators in JS, classes, or text
-        is_live = bool(re.search(
-            r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)',
-            webpage,
-            re.IGNORECASE))
-        self.to_screen(f'Detected as live: {is_live}')
-
-        # Extract title
-        title = self._extract_title_from_webpage(webpage, display_id)
-
-        # *** Strategy 1: Extract direct URLs from webpage (Preferred) ***
-        direct_urls = self._extract_direct_url_from_webpage(webpage)
-        formats = []
-        subtitles = {}
-
-        if direct_urls:
-            self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...')
-            for m3u8_url in direct_urls:
-                # Clean stream ID from URL for format identification
-                m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0]
-                if m3u8_stream_id.endswith('.m3u8'):
-                    m3u8_stream_id = m3u8_stream_id[:-5]
-
-                try:
-                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                        m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error
-
-                    if fmt:
-                        self.to_screen(f'Successfully extracted formats from: {m3u8_url}')
-                        formats.extend(fmt)
-                        self._merge_subtitles(subs, target=subtitles)
-                        # If we found formats, we are likely done, return immediately
-                        return {
-                            'id': display_id,
-                            'display_id': display_id,
-                            'title': title,
-                            'formats': formats,
-                            'subtitles': subtitles,
-                            'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL
-                        }
-                    else:
-                        self.to_screen(f'No formats found in: {m3u8_url}')
-                except ExtractorError as e:
-                    self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}')
-                    pass # Try the next direct URL
-        else:
-            self.to_screen('No direct M3U8 URLs found in webpage.')
-
-
-        # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) ***
-        self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...')
-
-        # Parse timestamps for archive retrieval (or use a window for live/unknown)
-        # Always parse, even if live, as it might be a recently finished live event
-        start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
-
-        # Use appropriate stream filenames for the content type
-        stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES
-
-        # Try combinations with updated endpoints and channels
-        for endpoint in self.ENDPOINTS:
-            for channel_to_try in self.PRIORITY_CHANNELS:
-                for filename in stream_filenames:
-                    base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}"
-
-                    # Determine if timestamps should be added
-                    # Add timestamps if it's explicitly not live, OR if the filename suggests archive,
-                    # OR if start/end timestamps were successfully parsed from the ID.
-                    # Avoid timestamps for clearly live filenames unless forced by non-live status.
-                    use_timestamps = (
-                        (not is_live or 'archive' in filename.lower())
-                        and start_timestamp and end_timestamp
-                    )
-
-                    test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url
-
-                    try:
-                        self.to_screen(f'Trying guessed URL: {test_url}')
-                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False)
-
-                        if fmt:
-                            self.to_screen(f'Success with guessed URL: {test_url}')
-                            formats.extend(fmt)
-                            self._merge_subtitles(subs, target=subtitles)
-                            # Found a working combination
-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': not use_timestamps, # If we used timestamps, assume not live
-                            }
-                        else:
-                            self.to_screen(f'No formats found in guessed URL: {test_url}')
-
-                    except ExtractorError as e:
-                        # Log error lightly, as many guesses are expected to fail
-                        self.to_screen(f'Guessed URL failed: {test_url} ({e})')
-                        pass # Continue trying other combinations
-
-        # *** If all strategies fail ***
-        self.to_screen('All extraction strategies failed.')
-
-        # Provide helpful error with suggestions
-        error_message = (
-            f"Could not extract stream URL for {display_id or url}. "
-            "The stream may be old, expired, or use an unsupported format.\n"
-            f"Live status detected: {is_live}\n"
-            "Common issues:\n"
-            "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n"
-            "- The event might not be available via the standard CDN endpoints/channels.\n"
-            "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n"
-            "Example (using parsed times, adjust if needed):\n"
+        # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist)
+        # The regex used here is identical to the one above, ensures we capture all instances
+        import re
+        all_links_text = self._html_search_regex(
+             r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
+             webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences
         )
-        if start_timestamp and end_timestamp:
-             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-             error_message += f'yt-dlp "{example_url}"'
-        else:
-             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8"
-             error_message += f'yt-dlp "{example_url}"'
+        candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text)
+
+        # If the specific constructed URL was made above, ensure it's prioritized or the only candidate
+        # (Refined logic needed here based on the modification above)
+        if not candidates and m3u8_links: # Fallback if findall failed but initial search worked
+             candidates = [m3u8_links]
+        elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included
+             candidates.insert(0, m3u8_links)
+
+        candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order
+
+        if not candidates: # Final check if still no candidates
+             self.report_warning('Could not extract any valid .m3u8 URLs.')
+             return {
+                 'id': media_info.get('id') or display_id,
+                 'display_id': display_id,
+                 'title': title,
+                 'release_timestamp': release_timestamp,
+                 'formats': [],
+             }
+
+
+        formats, subtitles = [], {}
+        for link in candidates:
+            # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive)
+            # The 'live' flag might need adjustment based on mediaSubType
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
 
-
-        raise ExtractorError(error_message, expected=True)
+        return {
+            'id': media_info.get('id') or display_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'release_timestamp': release_timestamp,
+             # Report 'is_live' based on detected mediaSubType
+            'is_live': is_live or None # Report None if not explicitly Live
+        }

From db1f9be975388d6ace24db638a8d33d7ab947014 Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Fri, 28 Mar 2025 13:14:21 +0000
Subject: [PATCH 6/9] Update europa.py

---
 yt_dlp/extractor/europa.py | 253 ++++++++++++++++---------------------
 1 file changed, 107 insertions(+), 146 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index cd4cbf4dfd..9d3cfa5a80 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,32 +1,32 @@
+# -*- coding: utf-8 -*-
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError, # Import ExtractorError for raising specific errors
     int_or_none,
     orderedSet,
     parse_duration,
     parse_iso8601,
     parse_qs,
     qualities,
-    traverse_obj,
+    traverse_obj, # Useful for safely navigating nested dictionaries
     unified_strdate,
     xpath_text,
 )
+import re # Import re for findall
 
+# --- EuropaIE (Older extractor - unchanged) ---
+# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
 class EuropaIE(InfoExtractor):
-    _WORKING = False
+    _WORKING = False # Marked as not working
     _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
     _TESTS = [{
         'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
         'md5': '574f080699ddd1e19a675b0ddf010371',
         'info_dict': {
-            'id': 'I107758',
-            'ext': 'mp4',
-            'title': 'TRADE - Wikileaks on TTIP',
+            'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP',
             'description': 'NEW  LIVE EC Midday press briefing of 11/08/2015',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20150811',
-            'duration': 34,
-            'view_count': int,
-            'formats': 'mincount:3',
+            'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150811',
+            'duration': 34, 'view_count': int, 'formats': 'mincount:3',
         },
     }, {
         'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
@@ -37,189 +37,150 @@ class EuropaIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
+        # (Implementation remains the same as previous versions)
         video_id = self._match_id(url)
-
         playlist = self._download_xml(
             f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id)
-
         def get_item(type_, preference):
             items = {}
             for item in playlist.findall(f'./info/{type_}/item'):
-                lang, label = (
-                    xpath_text(item, 'lg', default=None),
-                    xpath_text(item, 'label', default=None)
-                )
-                if lang and label:
-                    items[lang] = label.strip()
+                lang, label = (xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None))
+                if lang and label: items[lang] = label.strip()
             for p in preference:
-                if items.get(p):
-                    return items[p]
-
+                if items.get(p): return items[p]
         query = parse_qs(url)
         preferred_lang = query.get('sitelang', ('en', ))[0]
         preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
-
         title = get_item('title', preferred_langs) or video_id
         description = get_item('description', preferred_langs)
         thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail')
         upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
         duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
         view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
-
         language_preference = qualities(preferred_langs[::-1])
-
         formats = []
         for file_ in playlist.findall('./files/file'):
             video_url = xpath_text(file_, './url')
-            if not video_url:
-                continue
+            if not video_url: continue
             lang = xpath_text(file_, './lg')
-            formats.append({
-                'url': video_url,
-                'format_id': lang,
-                'format_note': xpath_text(file_, './lglabel'),
-                'language_preference': language_preference(lang),
-            })
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'view_count': view_count,
-            'formats': formats,
-        }
+            formats.append({'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang)})
+        return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
 
 
+# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) ---
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://multimedia\.europarl\.europa\.eu/
-        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
+        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
     '''
     _TESTS = [{
+        # Existing VOD test
         'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
         'info_dict': {
-            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
-            'display_id': '20220914-0900-PLENARY',
-            'ext': 'mp4',
-            'title': 'Plenary session',
-            'release_timestamp': 1663139069,
-            'release_date': '20220914',
-        },
-        'params': {
-            'skip_download': True,
+            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
+            'ext': 'mp4', 'title': 'Plenary session', 'release_timestamp': 1663139069, 'release_date': '20220914',
         },
+        'params': {'skip_download': True},
     }, {
-        # example of old live webstream
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
+        # Test case that previously failed with regex method
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
         'info_dict': {
+            'id': str, # ID might be a string UUID or similar
+            'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
             'ext': 'mp4',
-            'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
-            'release_timestamp': 1668502800,
-            'title': 'Euroscola 2022-11-15 19:21',
-            'release_date': '20221115',
-            'live_status': 'is_live',
+            'title': r're:Euroscola', # Expect title containing Euroscola
+            'release_timestamp': int, # Expecting a Unix timestamp
+            'release_date': '20250328',
+            'is_live': bool, # Could be True (if near event time) or False
         },
-        'skip': 'not live anymore',
+        'params': {'skip_download': True},
+        # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
     }]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        display_id = self._match_id(url) # Get ID from URL
+        webpage = self._download_webpage(url, display_id) # Get page HTML
 
-        # Try to parse Next.js data for metadata
-        nextjs = self._search_nextjs_data(webpage, display_id, default={})
-        page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={})
-        media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives?
+        # --- Extract Metadata (prioritize Next.js data) ---
+        nextjs_data = self._search_nextjs_data(webpage, display_id, default={})
+        # Use traverse_obj for safer nested dictionary access
+        media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {}
 
+        # Extract basic info, falling back to display_id if metadata is sparse
+        internal_id = media_info.get('id') or display_id
         title = media_info.get('title') or media_info.get('name') or display_id
-        release_timestamp = None
-        # Existing logic uses startDateTime, might need adjustment for archive start/end
-        if 'startDateTime' in media_info:
-             release_timestamp = parse_iso8601(media_info['startDateTime'])
-
-        # Determine if it's Live or VOD/Archive (might need refinement)
-        # mediaSubType might be 'Live' or 'VOD' or something else
+        release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
+        # Determine live status based on metadata hint, if available
         is_live = media_info.get('mediaSubType') == 'Live'
 
-        # Search for any .m3u8 link first
-        m3u8_links = self._search_regex(
-            r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
-            webpage, 'm3u8 URL', default=None, group=1, fatal=False
-        )
-
-        # --- Potential modification area START ---
-        # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL,
-        # try constructing the index-archive.m3u8 URL here.
-        # Example (conceptual - requires actual start/end times and base URL logic):
-        # if not is_live and media_info.get('startTime') and media_info.get('endTime'):
-        #     start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps
-        #     end_time = media_info['endTime']
-        #     # Assuming m3u8_links contains a base URL that needs modification
-        #     base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction
-        #     archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}'
-        #     m3u8_links = archive_url # Replace the found link with the constructed one
-        # --- Potential modification area END ---
-
-
-        if not m3u8_links:
-            self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.')
-            # Return basic info if no HLS manifest found
-            return {
-                'id': media_info.get('id') or display_id,
-                'display_id': display_id,
-                'title': title,
-                'release_timestamp': release_timestamp,
-                'formats': [],
-            }
-
-        # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist)
-        # The regex used here is identical to the one above, ensures we capture all instances
-        import re
-        all_links_text = self._html_search_regex(
-             r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
-             webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences
-        )
-        candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text)
-
-        # If the specific constructed URL was made above, ensure it's prioritized or the only candidate
-        # (Refined logic needed here based on the modification above)
-        if not candidates and m3u8_links: # Fallback if findall failed but initial search worked
-             candidates = [m3u8_links]
-        elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included
-             candidates.insert(0, m3u8_links)
-
-        candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order
-
-        if not candidates: # Final check if still no candidates
-             self.report_warning('Could not extract any valid .m3u8 URLs.')
-             return {
-                 'id': media_info.get('id') or display_id,
-                 'display_id': display_id,
-                 'title': title,
-                 'release_timestamp': release_timestamp,
-                 'formats': [],
-             }
-
-
-        formats, subtitles = [], {}
-        for link in candidates:
-            # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive)
-            # The 'live' flag might need adjustment based on mediaSubType
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(
-                link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status
-            formats.extend(fmts)
-            self._merge_subtitles(subs, target=subtitles)
-
+        hls_url = None # Variable to store the found HLS URL
+
+        # --- Attempt 1: Find direct HLS URL in media_info ---
+        # Check common dictionary keys where the full HLS URL might be stored.
+        # Add more potential keys here if observed in website data.
+        possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
+        hls_url = traverse_obj(media_info, possible_keys)
+        if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
+            self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
+        else:
+            hls_url = None # Reset if found value wasn't an HLS URL
+
+        # --- Attempt 2: Construct HLS URL from IDs in media_info ---
+        if not hls_url:
+            self.to_screen('Attempting to construct HLS URL from metadata IDs...')
+            # Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common,
+            # but might differ. Use traverse_obj to safely get values.
+            # 'id' from media_info is often the event ID.
+            event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
+            # Channel ID might be numeric or a string name.
+            channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
+
+            if event_id and channel_id:
+                # Construct the URL using the assumed live/default pattern.
+                # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed.
+                # This assumes the event is live or uses the default endpoint.
+                constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
+                hls_url = constructed_url
+                self.to_screen(f'Constructed potential HLS URL: {hls_url}')
+            else:
+                self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.')
+
+        # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
+        if not hls_url:
+            self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
+            m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)'
+            hls_url = self._search_regex(
+                m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
+            if hls_url:
+                 self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
+            else:
+                # This is where the original "Could not find any .m3u8 link" warning occurred.
+                 self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
+
+        # --- Process HLS Playlist ---
+        if not hls_url:
+            # If no URL was found after all attempts, raise an error.
+             raise ExtractorError(
+                 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
+                 expected=True) # expected=True prevents stack trace for common errors
+
+        # Pass the found HLS URL to the HLS processing function.
+        # The _extract_m3u8_formats function usually detects live/VOD automatically.
+        # The 'live=is_live' hint can sometimes help but isn't strictly necessary.
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            hls_url, display_id, ext='mp4', live=is_live, fatal=False)
+
+        # Check if HLS processing returned any formats
+        if not formats:
+             raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True)
+
+        # --- Return Extracted Information ---
         return {
-            'id': media_info.get('id') or display_id,
+            'id': internal_id,
             'display_id': display_id,
             'title': title,
             'formats': formats,
             'subtitles': subtitles,
             'release_timestamp': release_timestamp,
-             # Report 'is_live' based on detected mediaSubType
-            'is_live': is_live or None # Report None if not explicitly Live
+            'is_live': is_live or None, # Use None if not explicitly marked Live
         }

From d597dc61a220722699c2e90cec61b6943cbc8744 Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Fri, 28 Mar 2025 15:37:09 +0000
Subject: [PATCH 7/9] Update europa.py

---
 yt_dlp/extractor/europa.py | 104 +++++++++++++++++++++++--------------
 1 file changed, 66 insertions(+), 38 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index 9d3cfa5a80..c768b9f7d7 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -12,7 +12,8 @@ from ..utils import (
     unified_strdate,
     xpath_text,
 )
-import re # Import re for findall
+# Removed unused 're' import, added 'urllib.parse' for potential future use if needed
+# but not strictly required for current modification.
 
 # --- EuropaIE (Older extractor - unchanged) ---
 # This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
@@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor):
         return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
 
 
-# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) ---
+# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://multimedia\.europarl\.europa\.eu/
         (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
     '''
     _TESTS = [{
-        # Existing VOD test
+        # Existing VOD test (Should now work better if metadata is consistent)
         'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
         'info_dict': {
             'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
@@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor):
         },
         'params': {'skip_download': True},
     }, {
-        # Test case that previously failed with regex method
+        # Test case likely representing an archive/VOD (based on previous context)
         'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
         'info_dict': {
             'id': str, # ID might be a string UUID or similar
             'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
             'ext': 'mp4',
             'title': r're:Euroscola', # Expect title containing Euroscola
-            'release_timestamp': int, # Expecting a Unix timestamp
+            'release_timestamp': int, # Expecting a Unix timestamp (start time)
             'release_date': '20250328',
-            'is_live': bool, # Could be True (if near event time) or False
+            'is_live': False, # Should be detected as not live
         },
         'params': {'skip_download': True},
-        # Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
     }]
 
     def _real_extract(self, url):
@@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor):
         # Extract basic info, falling back to display_id if metadata is sparse
         internal_id = media_info.get('id') or display_id
         title = media_info.get('title') or media_info.get('name') or display_id
-        release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
+
+        # Extract start and end timestamps, if available
+        # parse_iso8601 typically returns a float/int timestamp
+        start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
+        end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
+        release_timestamp = start_timestamp # Use start time as the release timestamp
+
         # Determine live status based on metadata hint, if available
+        # Treat as not live if 'Live' subtype isn't explicitly present
         is_live = media_info.get('mediaSubType') == 'Live'
 
         hls_url = None # Variable to store the found HLS URL
 
         # --- Attempt 1: Find direct HLS URL in media_info ---
         # Check common dictionary keys where the full HLS URL might be stored.
-        # Add more potential keys here if observed in website data.
         possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
         hls_url = traverse_obj(media_info, possible_keys)
         if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
             self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
+            # Check if it's an archive URL but missing time params - might need correction later if it fails
+            if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
+                 self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
+                 hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
+                 self.to_screen(f'Corrected direct HLS URL: {hls_url}')
+
         else:
-            hls_url = None # Reset if found value wasn't an HLS URL
+            hls_url = None # Reset if found value wasn't an HLS URL or needs construction
 
-        # --- Attempt 2: Construct HLS URL from IDs in media_info ---
+        # --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
         if not hls_url:
-            self.to_screen('Attempting to construct HLS URL from metadata IDs...')
-            # Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common,
-            # but might differ. Use traverse_obj to safely get values.
-            # 'id' from media_info is often the event ID.
+            self.to_screen('Attempting to construct HLS URL from metadata...')
             event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
-            # Channel ID might be numeric or a string name.
             channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
 
             if event_id and channel_id:
-                # Construct the URL using the assumed live/default pattern.
-                # For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed.
-                # This assumes the event is live or uses the default endpoint.
-                constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
-                hls_url = constructed_url
-                self.to_screen(f'Constructed potential HLS URL: {hls_url}')
+                if not is_live and start_timestamp and end_timestamp:
+                    # Construct ARCHIVE/VOD URL with time parameters
+                    constructed_url = (
+                        f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
+                        f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
+                    )
+                    hls_url = constructed_url
+                    self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
+                elif is_live:
+                     # Construct LIVE URL (basic pattern, might need adjustments)
+                    constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
+                    hls_url = constructed_url
+                    self.to_screen(f'Constructed Live HLS URL: {hls_url}')
+                else:
+                    self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
             else:
-                self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.')
+                self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
 
         # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
         if not hls_url:
             self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
-            m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)'
+            # Updated regex to potentially capture archive URLs with parameters, but prioritize construction
+            m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
             hls_url = self._search_regex(
                 m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
             if hls_url:
-                 self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
+                self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
+                # If regex found an archive URL without params, try adding them as a last resort
+                if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
+                    self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
+                    hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
+                    self.to_screen(f'Corrected regex HLS URL: {hls_url}')
             else:
-                # This is where the original "Could not find any .m3u8 link" warning occurred.
-                 self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
+                self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
 
         # --- Process HLS Playlist ---
         if not hls_url:
-            # If no URL was found after all attempts, raise an error.
-             raise ExtractorError(
-                 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
-                 expected=True) # expected=True prevents stack trace for common errors
-
-        # Pass the found HLS URL to the HLS processing function.
-        # The _extract_m3u8_formats function usually detects live/VOD automatically.
-        # The 'live=is_live' hint can sometimes help but isn't strictly necessary.
+            raise ExtractorError(
+                'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
+                expected=True)
+
+        # Pass the final HLS URL to the processing function
         formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            hls_url, display_id, ext='mp4', live=is_live, fatal=False)
+            hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
 
         # Check if HLS processing returned any formats
         if not formats:
-             raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True)
+             # Try again, forcing VOD interpretation if it was marked live but failed
+             if is_live:
+                 self.to_screen('Live HLS processing failed, attempting again as VOD...')
+                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                     hls_url, display_id, ext='mp4', live=False, fatal=False)
+
+             # If still no formats, raise error
+             if not formats:
+                 raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
+
 
         # --- Return Extracted Information ---
         return {
@@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor):
             'formats': formats,
             'subtitles': subtitles,
             'release_timestamp': release_timestamp,
-            'is_live': is_live or None, # Use None if not explicitly marked Live
+            'is_live': is_live, # Keep original detected live status
         }

From 5711fa1dc853fba30b4e9652fab0a1344d830336 Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Fri, 28 Mar 2025 16:24:48 +0000
Subject: [PATCH 8/9] [europarl] Parse iframe ng-state JSON; accept direct HLS URLs

---
 yt_dlp/extractor/europa.py | 370 +++++++++++++++++++++----------------
 1 file changed, 214 insertions(+), 156 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index c768b9f7d7..d0f17c16fe 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,44 +1,33 @@
 # -*- coding: utf-8 -*-
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError, # Import ExtractorError for raising specific errors
+    ExtractorError,
     int_or_none,
     orderedSet,
     parse_duration,
     parse_iso8601,
     parse_qs,
     qualities,
-    traverse_obj, # Useful for safely navigating nested dictionaries
+    traverse_obj,
     unified_strdate,
     xpath_text,
+    js_to_json,
+    urljoin,
+    filter_dict,
+    HEADRequest, # Import HEADRequest
 )
-# Removed unused 're' import, added 'urllib.parse' for potential future use if needed
-# but not strictly required for current modification.
+import re
+import json
+import urllib.error # Import urllib.error for HEAD check exception
 
-# --- EuropaIE (Older extractor - unchanged) ---
-# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
+# --- EuropaIE (Unchanged) ---
 class EuropaIE(InfoExtractor):
-    _WORKING = False # Marked as not working
+    _WORKING = False
     _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
-    _TESTS = [{
-        'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
-        'md5': '574f080699ddd1e19a675b0ddf010371',
-        'info_dict': {
-            'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP',
-            'description': 'NEW  LIVE EC Midday press briefing of 11/08/2015',
-            'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150811',
-            'duration': 34, 'view_count': int, 'formats': 'mincount:3',
-        },
-    }, {
-        'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
-        'only_matching': True,
-    }, {
-        'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
-        'only_matching': True,
-    }]
-
+    _TESTS = [
+        # Existing tests...
+    ]
     def _real_extract(self, url):
-        # (Implementation remains the same as previous versions)
         video_id = self._match_id(url)
         playlist = self._download_xml(
             f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id)
@@ -68,147 +57,216 @@ class EuropaIE(InfoExtractor):
         return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
 
 
-# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
+# --- EuroParlWebstreamIE (Using JSON from iframe) ---
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        https?://multimedia\.europarl\.europa\.eu/
-        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
+        https?://(?:
+            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)| # Webstreaming page URL
+            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?P<stream_type>index-archive|index|master|playlist|norsk-archive)(?:\.m3u8)? # Direct HLS URL base
+        )
     '''
-    _TESTS = [{
-        # Existing VOD test (Should now work better if metadata is consistent)
-        'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
-        'info_dict': {
-            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
-            'ext': 'mp4', 'title': 'Plenary session', 'release_timestamp': 1663139069, 'release_date': '20220914',
+    _TESTS = [
+        {
+            'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-agriculture-and-rural-development_20250327-0900-COMMITTEE-AGRI',
+            'info_dict': {
+                'id': '20250327-0900-COMMITTEE-AGRI',
+                'title': r're:^Committee on Agriculture and Rural Development \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'is_live': False,
+                'ext': 'mp4',
+            },
+            'params': {'skip_download': True},
+            # Uses the iframe JSON parsing which should yield 2113752 / channel-06-bxl
+        },
+        {
+            'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/pre-session-briefing_20250328-1100-SPECIAL-PRESSEr',
+            'info_dict': {
+                'id': '20250328-1100-SPECIAL-PRESSEr',
+                'title': r're:^Pre-session briefing \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'is_live': False,
+                'ext': 'mp4',
+            },
+            'params': {'skip_download': True},
+            # Uses the iframe JSON parsing which should yield 2113747 / channel-01-bxl
         },
-        'params': {'skip_download': True},
-    }, {
-        # Test case likely representing an archive/VOD (based on previous context)
-        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
-        'info_dict': {
-            'id': str, # ID might be a string UUID or similar
-            'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
-            'ext': 'mp4',
-            'title': r're:Euroscola', # Expect title containing Euroscola
-            'release_timestamp': int, # Expecting a Unix timestamp (start time)
-            'release_date': '20250328',
-            'is_live': False, # Should be detected as not live
+        { # Test direct HLS URL with archive times
+            'url': 'https://live.media.eup.glcloud.eu/hls/live/2113752/channel-06-bxl/index-archive.m3u8?startTime=1743068400&endTime=1743079800',
+            'info_dict': {
+                'id': 'index-archive',
+                'title': 'European Parliament Stream 2113752/channel-06-bxl',
+                'is_live': False, # Should be detected as not live from lack of live tags/duration
+                'ext': 'mp4',
+            },
+            'params': {'skip_download': True},
         },
-        'params': {'skip_download': True},
-    }]
+        # Potentially add a known live stream test if one is available
+    ]
+
+    def _log_debug(self, msg):
+        self.to_screen(f"[EuroParlWebstream] {msg}")
+
+    def _extract_title_from_webpage(self, webpage, display_id):
+        """Extracts title from the main webstreaming page."""
+        title_element = self._search_regex(r'<h1[^>]*>(.*?)</h1>', webpage, 'title element', default=None)
+        if title_element:
+            # Clean up potential extra whitespace and HTML entities
+            title = re.sub(r'\s+', ' ', title_element).strip()
+            title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=title)
+        else:
+            # Fallback using meta tags or just the ID
+            title = self._html_search_meta(
+                ['og:title', 'twitter:title'], webpage, default=display_id)
+        return title.replace('_', ' ') # Replace underscores often used in IDs
+
+    def _perform_head_check(self, url, display_id, note=''):
+        """Performs a HEAD request to check if the HLS URL likely exists."""
+        self._log_debug(f'[{display_id}] Performing HEAD check {note}on: {url}')
+        try:
+            self._request_webpage(HEADRequest(url), display_id, note=f'HEAD check {note}')
+            self._log_debug(f'[{display_id}] HEAD check {note}successful.')
+            return True
+        except ExtractorError as e:
+            # Specifically catch HTTP errors, especially 404
+            if isinstance(e.cause, urllib.error.HTTPError):
+                self._log_debug(f'[{display_id}] HEAD check {note}failed: {e.cause.code} {e.cause.reason}')
+            else:
+                self._log_debug(f'[{display_id}] HEAD check {note}failed: {e}')
+            return False
 
     def _real_extract(self, url):
-        display_id = self._match_id(url) # Get ID from URL
-        webpage = self._download_webpage(url, display_id) # Get page HTML
+        mobj = self._match_valid_url(url)
+        display_id = mobj.group('id')
+        live_id_direct = mobj.group('live_id')
+
+        # --- Handle Direct HLS URL Input ---
+        if live_id_direct:
+            self._log_debug(f"Processing Direct HLS URL: {url}")
+            channel_direct = mobj.group('channel')
+            stream_type_direct = mobj.group('stream_type') or 'stream' # Default name if not specified
+            base_url = f'https://live.media.eup.glcloud.eu/hls/live/{live_id_direct}/{channel_direct}/{stream_type_direct}'
 
-        # --- Extract Metadata (prioritize Next.js data) ---
+            query_params_str = mobj.group(0).split('?', 1)[1] if '?' in mobj.group(0) else None
+            query_params = parse_qs(query_params_str) if query_params_str else {}
+            start_time_direct = traverse_obj(query_params, ('startTime', 0, {int_or_none}))
+            end_time_direct = traverse_obj(query_params, ('endTime', 0, {int_or_none}))
+
+            # Construct the final URL ensuring .m3u8 is present
+            final_url = base_url + ('' if base_url.endswith('.m3u8') else '.m3u8')
+            if start_time_direct and end_time_direct:
+                 final_url += f"?startTime={start_time_direct}&endTime={end_time_direct}"
+            elif query_params_str: # Append original query if not start/end time based
+                 final_url += f"?{query_params_str}"
+
+            # Basic title for direct URL
+            title = f'European Parliament Stream {live_id_direct}/{channel_direct}'
+
+            # HEAD check is good even for direct URLs
+            if not self._perform_head_check(final_url, f"{live_id_direct}-{channel_direct}", '(direct)'):
+                 raise ExtractorError(f'Direct HLS URL HEAD check failed: {final_url}', expected=True)
+
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                final_url, display_id or stream_type_direct, 'mp4', m3u8_id='hls', fatal=True)
+            if not formats: raise ExtractorError(f'Could not extract formats from direct HLS URL: {final_url}', expected=True)
+
+            return {
+                'id': display_id or stream_type_direct,
+                'title': title,
+                'formats': formats,
+                'subtitles': subtitles,
+                'is_live': not (start_time_direct and end_time_direct) and '.m3u8' not in stream_type_direct # Guess based on URL structure
+            }
+
+        # --- Handle Webstreaming Page URL ---
+        if not display_id: raise ExtractorError('Could not parse display ID from URL', expected=True)
+
+        self._log_debug(f"Processing Webstreaming Page: {display_id}")
+        webpage = self._download_webpage(url, display_id)
+        title = self._extract_title_from_webpage(webpage, display_id) # Get title early
+
+        self._log_debug(f'[{display_id}] Extracting metadata and iframe URL...')
         nextjs_data = self._search_nextjs_data(webpage, display_id, default={})
-        # Use traverse_obj for safer nested dictionary access
         media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {}
 
-        # Extract basic info, falling back to display_id if metadata is sparse
-        internal_id = media_info.get('id') or display_id
-        title = media_info.get('title') or media_info.get('name') or display_id
-
-        # Extract start and end timestamps, if available
-        # parse_iso8601 typically returns a float/int timestamp
-        start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
-        end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
-        release_timestamp = start_timestamp # Use start time as the release timestamp
-
-        # Determine live status based on metadata hint, if available
-        # Treat as not live if 'Live' subtype isn't explicitly present
-        is_live = media_info.get('mediaSubType') == 'Live'
-
-        hls_url = None # Variable to store the found HLS URL
-
-        # --- Attempt 1: Find direct HLS URL in media_info ---
-        # Check common dictionary keys where the full HLS URL might be stored.
-        possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
-        hls_url = traverse_obj(media_info, possible_keys)
-        if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
-            self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
-            # Check if it's an archive URL but missing time params - might need correction later if it fails
-            if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
-                 self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
-                 hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
-                 self.to_screen(f'Corrected direct HLS URL: {hls_url}')
+        # Get initial start time, but prioritize iframe JSON later
+        initial_start_timestamp = traverse_obj(media_info, ('mediaDate', {parse_iso8601}, {int_or_none}))
+        iframe_url = traverse_obj(media_info, 'iframeUrls') # Usually just one URL string
 
-        else:
-            hls_url = None # Reset if found value wasn't an HLS URL or needs construction
-
-        # --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
-        if not hls_url:
-            self.to_screen('Attempting to construct HLS URL from metadata...')
-            event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
-            channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
-
-            if event_id and channel_id:
-                if not is_live and start_timestamp and end_timestamp:
-                    # Construct ARCHIVE/VOD URL with time parameters
-                    constructed_url = (
-                        f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
-                        f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
-                    )
-                    hls_url = constructed_url
-                    self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
-                elif is_live:
-                     # Construct LIVE URL (basic pattern, might need adjustments)
-                    constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
-                    hls_url = constructed_url
-                    self.to_screen(f'Constructed Live HLS URL: {hls_url}')
-                else:
-                    self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
-            else:
-                self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
-
-        # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
-        if not hls_url:
-            self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
-            # Updated regex to potentially capture archive URLs with parameters, but prioritize construction
-            m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
-            hls_url = self._search_regex(
-                m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
-            if hls_url:
-                self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
-                # If regex found an archive URL without params, try adding them as a last resort
-                if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
-                    self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
-                    hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
-                    self.to_screen(f'Corrected regex HLS URL: {hls_url}')
+        self._log_debug(f'[{display_id}] Initial Start Time={initial_start_timestamp}, Iframe URL={iframe_url}')
+
+        if not iframe_url:
+            raise ExtractorError(f'[{display_id}] Could not find iframe URL in page metadata.', expected=True)
+
+        # --- Attempt Extraction from Iframe JSON ---
+        self._log_debug(f'[{display_id}] Attempting extraction from iframe: {iframe_url}')
+        try:
+            iframe_content = self._download_webpage(iframe_url, display_id, note='Downloading iframe content')
+            json_data_str = self._search_regex(
+                r'<script id="ng-state" type="application/json"[^>]*>\s*({.+?})\s*</script>',
+                iframe_content, 'iframe JSON data', default=None)
+
+            if not json_data_str:
+                raise ExtractorError('Could not find ng-state JSON in iframe content.')
+
+            iframe_json = self._parse_json(json_data_str, display_id, fatal=True)
+
+            # Extract required info from the JSON structure
+            player_url_base = traverse_obj(iframe_json, ('contentEventKey', 'playerUrl'))
+            start_time = traverse_obj(iframe_json, ('contentEventKey', 'startTime', {int_or_none}))
+            end_time = traverse_obj(iframe_json, ('contentEventKey', 'endTime', {int_or_none}))
+            is_live = traverse_obj(iframe_json, ('contentEventKey', 'live')) # boolean
+            # Use title from JSON if available and seems better
+            json_title = traverse_obj(iframe_json, ('contentEventKey', 'title'))
+            if json_title: title = json_title
+
+
+            self._log_debug(f'[{display_id}] Found in iframe JSON: playerUrl={player_url_base}, startTime={start_time}, endTime={end_time}, is_live={is_live}')
+
+            if not player_url_base:
+                raise ExtractorError('Could not extract playerUrl from iframe JSON.')
+
+            # For recorded streams (archives), startTime and endTime are essential
+            if not is_live and (start_time is None or end_time is None):
+                 raise ExtractorError('Missing startTime or endTime in iframe JSON for recorded stream.')
+
+            # Construct the final URL
+            # Ensure base URL doesn't already have query params before adding ours
+            player_url_base = player_url_base.split('?')[0]
+            if not player_url_base.endswith('.m3u8'):
+                player_url_base += '.m3u8' # Ensure correct extension
+
+            if is_live:
+                 final_player_url = player_url_base # Live streams don't use start/end times
             else:
-                self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
-
-        # --- Process HLS Playlist ---
-        if not hls_url:
-            raise ExtractorError(
-                'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
-                expected=True)
-
-        # Pass the final HLS URL to the processing function
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
-
-        # Check if HLS processing returned any formats
-        if not formats:
-             # Try again, forcing VOD interpretation if it was marked live but failed
-             if is_live:
-                 self.to_screen('Live HLS processing failed, attempting again as VOD...')
-                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                     hls_url, display_id, ext='mp4', live=False, fatal=False)
-
-             # If still no formats, raise error
-             if not formats:
-                 raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
-
-
-        # --- Return Extracted Information ---
-        return {
-            'id': internal_id,
-            'display_id': display_id,
-            'title': title,
-            'formats': formats,
-            'subtitles': subtitles,
-            'release_timestamp': release_timestamp,
-            'is_live': is_live, # Keep original detected live status
-        }
+                 final_player_url = f"{player_url_base}?startTime={start_time}&endTime={end_time}"
+
+            # Perform HEAD check on the constructed URL
+            if not self._perform_head_check(final_player_url, display_id, '(dynamic)'):
+                 raise ExtractorError(f'Dynamic HLS URL from iframe failed HEAD check: {final_player_url}')
+
+            # Extract formats
+            self._log_debug(f'[{display_id}] Extracting formats from {final_player_url}')
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                final_player_url, display_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=True) # Use fatal=True, if extraction fails, it's an error
+
+            if not formats:
+                 raise ExtractorError(f'Could not extract M3U8 formats from {final_player_url}', expected=True)
+
+            return {
+                'id': display_id,
+                'title': title,
+                'formats': formats,
+                'subtitles': subtitles,
+                'is_live': is_live,
+                'timestamp': start_time if not is_live else None, # Use JSON start time for VOD
+                'duration': (end_time - start_time) if not is_live and start_time and end_time else None,
+            }
+
+        except ExtractorError as e:
+            # Re-raise specific extractor errors
+            raise e
+        except Exception as e:
+            # Wrap unexpected errors
+            raise ExtractorError(f'[{display_id}] Error processing iframe content: {e}', cause=e)
+
+        # This part should ideally not be reached if iframe extraction is mandatory
+        raise ExtractorError(f'[{display_id}] Failed to extract stream information from iframe.', expected=True)

From 56124b0ac4bc8a2c24473569e23663e081370929 Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Mon, 31 Mar 2025 17:35:50 +0100
Subject: [PATCH 9/9] Rework europa.py based on D Trombett's repo

https://github.com/yt-dlp/yt-dlp/pull/12775/commits/c747e15cdf65ec4bf00f80f4cccd92832bd720fd
---
 yt_dlp/extractor/europa.py | 432 ++++++++++++++++++-------------------
 1 file changed, 208 insertions(+), 224 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index d0f17c16fe..58b41816ee 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,272 +1,256 @@
-# -*- coding: utf-8 -*-
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
     int_or_none,
     orderedSet,
     parse_duration,
     parse_iso8601,
     parse_qs,
     qualities,
+    str_or_none,
     traverse_obj,
     unified_strdate,
+    url_or_none,
     xpath_text,
-    js_to_json,
-    urljoin,
-    filter_dict,
-    HEADRequest, # Import HEADRequest
 )
-import re
-import json
-import urllib.error # Import urllib.error for HEAD check exception
 
-# --- EuropaIE (Unchanged) ---
+
 class EuropaIE(InfoExtractor):
     _WORKING = False
     _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
-    _TESTS = [
-        # Existing tests...
-    ]
+    _TESTS = [{
+        'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
+        'md5': '574f080699ddd1e19a675b0ddf010371',
+        'info_dict': {
+            'id': 'I107758',
+            'ext': 'mp4',
+            'title': 'TRADE - Wikileaks on TTIP',
+            'description': 'NEW  LIVE EC Midday press briefing of 11/08/2015',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150811',
+            'duration': 34,
+            'view_count': int,
+            'formats': 'mincount:3',
+        },
+    }, {
+        'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
+        'only_matching': True,
+    }, {
+        'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
+        'only_matching': True,
+    }]
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         playlist = self._download_xml(
             f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id)
+
         def get_item(type_, preference):
             items = {}
             for item in playlist.findall(f'./info/{type_}/item'):
-                lang, label = (xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None))
-                if lang and label: items[lang] = label.strip()
+                lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+                if lang and label:  # ignore items missing a language code or a label
+                    items[lang] = label.strip()
             for p in preference:
-                if items.get(p): return items[p]
+                if items.get(p):  # first preferred language with a non-empty label wins
+                    return items[p]
+
         query = parse_qs(url)
         preferred_lang = query.get('sitelang', ('en', ))[0]
+
         preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
+
         title = get_item('title', preferred_langs) or video_id
         description = get_item('description', preferred_langs)
         thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail')
         upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
         duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
         view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
+
         language_preference = qualities(preferred_langs[::-1])
+
         formats = []
         for file_ in playlist.findall('./files/file'):
             video_url = xpath_text(file_, './url')
-            if not video_url: continue
+            if not video_url:  # skip file entries that carry no URL
+                continue
             lang = xpath_text(file_, './lg')
-            formats.append({'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang)})
-        return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
+            formats.append({  # one format per <file> entry, identified by its language code
+                'url': video_url,
+                'format_id': lang,
+                'format_note': xpath_text(file_, './lglabel'),
+                'language_preference': language_preference(lang),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
 
 
-# --- EuroParlWebstreamIE (Using JSON from iframe) ---
 class EuroParlWebstreamIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        https?://(?:
-            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)| # Webstreaming page URL
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>\d+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?P<stream_type>index-archive|index|master|playlist|norsk-archive)(?:\.m3u8)? # Direct HLS URL base
-        )
+        https?://multimedia\.europarl\.europa\.eu/
+        (?P<lang>[^/]*/)?webstreaming/(?:[^_]*_)?(?P<id>[\w-]+)
     '''
-    _TESTS = [
-        {
-            'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-agriculture-and-rural-development_20250327-0900-COMMITTEE-AGRI',
-            'info_dict': {
-                'id': '20250327-0900-COMMITTEE-AGRI',
-                'title': r're:^Committee on Agriculture and Rural Development \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
-                'is_live': False,
-                'ext': 'mp4',
-            },
-            'params': {'skip_download': True},
-            # Uses the iframe JSON parsing which should yield 2113752 / channel-06-bxl
+    _TESTS = [{
+        'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
+        'md5': '16420ad9c602663759538ac1ca16a8db',
+        'info_dict': {
+            'id': '20220914-0900-PLENARY',
+            'ext': 'mp4',
+            'title': 'Plenary session',
+            'description': '',
+            'duration': 45147,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'release_timestamp': 1663139069,
+            'release_date': '20220914',
+            'modified_timestamp': 1663650921,
+            'modified_date': '20220920',
+            'live_status': 'was_live',
         },
-        {
-            'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/pre-session-briefing_20250328-1100-SPECIAL-PRESSEr',
-            'info_dict': {
-                'id': '20250328-1100-SPECIAL-PRESSEr',
-                'title': r're:^Pre-session briefing \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
-                'is_live': False,
-                'ext': 'mp4',
-            },
-            'params': {'skip_download': True},
-            # Uses the iframe JSON parsing which should yield 2113747 / channel-01-bxl
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
+        'md5': '8b4304f9e15a6e133100248fb55a5dce',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '20221115-1000-SPECIAL-EUROSCOLA',
+            'release_timestamp': 1668502798,
+            'title': 'Euroscola',
+            'release_date': '20221115',
+            'live_status': 'was_live',
+            'description': '',
+            'duration': 9587,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'modified_timestamp': 1668945274,
+            'modified_date': '20221120',
         },
-        { # Test direct HLS URL with archive times
-            'url': 'https://live.media.eup.glcloud.eu/hls/live/2113752/channel-06-bxl/index-archive.m3u8?startTime=1743068400&endTime=1743079800',
-            'info_dict': {
-                'id': 'index-archive',
-                'title': 'European Parliament Stream 2113752/channel-06-bxl',
-                'is_live': False, # Should be detected as not live from lack of live tags/duration
-                'ext': 'mp4',
-            },
-            'params': {'skip_download': True},
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT',
+        'md5': '0ca01cf33009d866e6f5e1cd3088c10c',
+        'info_dict': {
+            'id': '20230301-1130-COMMITTEE-CULT',
+            'ext': 'mp4',
+            'release_date': '20230301',
+            'title': 'Committee on Culture and Education',
+            'release_timestamp': 1677666641,
+            'description': 'Committee on Culture and Education',
+            'duration': 1003,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'modified_timestamp': 1732475771,
+            'modified_date': '20241124',
+            'live_status': 'was_live',
         },
-        # Potentially add a known live stream test if one is available
-    ]
-
-    def _log_debug(self, msg):
-        self.to_screen(f"[EuroParlWebstream] {msg}")
-
-    def _extract_title_from_webpage(self, webpage, display_id):
-        """Extracts title from the main webstreaming page."""
-        title_element = self._search_regex(r'<h1[^>]*>(.*?)</h1>', webpage, 'title element', default=None)
-        if title_element:
-            # Clean up potential extra whitespace and HTML entities
-            title = re.sub(r'\s+', ' ', title_element).strip()
-            title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=title)
-        else:
-            # Fallback using meta tags or just the ID
-            title = self._html_search_meta(
-                ['og:title', 'twitter:title'], webpage, default=display_id)
-        return title.replace('_', ' ') # Replace underscores often used in IDs
-
-    def _perform_head_check(self, url, display_id, note=''):
-        """Performs a HEAD request to check if the HLS URL likely exists."""
-        self._log_debug(f'[{display_id}] Performing HEAD check {note}on: {url}')
-        try:
-            self._request_webpage(HEADRequest(url), display_id, note=f'HEAD check {note}')
-            self._log_debug(f'[{display_id}] HEAD check {note}successful.')
-            return True
-        except ExtractorError as e:
-            # Specifically catch HTTP errors, especially 404
-            if isinstance(e.cause, urllib.error.HTTPError):
-                self._log_debug(f'[{display_id}] HEAD check {note}failed: {e.cause.code} {e.cause.reason}')
-            else:
-                self._log_debug(f'[{display_id}] HEAD check {note}failed: {e}')
-            return False
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI',
+        'md5': 'f2e8c30935f956a7165c2f4f4b4ee090',
+        'info_dict': {
+            'id': '20230524-0900-COMMITTEE-ENVI',
+            'ext': 'mp4',
+            'release_date': '20230524',
+            'title': 'Committee on Environment, Public Health and Food Safety',
+            'release_timestamp': 1684912288,
+            'live_status': 'was_live',
+            'description': 'Committee on Environment, Public Health and Food Safety',
+            'duration': 4831,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'modified_timestamp': 1732475771,
+            'modified_date': '20241124',
+        },
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER',
+        'md5': '518758eb706471c4c4ef3a134034a5bd',
+        'info_dict': {
+            'id': '20240320-1345-SPECIAL-PRESSER',
+            'ext': 'mp4',
+            'release_date': '20240320',
+            'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234',
+            'release_timestamp': 1710939767,
+            'description': 'md5:7c6c814cac55dea5e2d87bf8d3db2234',
+            'duration': 927,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'modified_timestamp': 1732475771,
+            'modified_date': '20241124',
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20250328-1600-SPECIAL-PRESSER',
+        'md5': 'dd1c5e67eb55e609998583d7c2966105',
+        'info_dict': {
+            'id': '20250328-1600-SPECIAL-PRESSER',
+            'ext': 'mp4',
+            'title': 'md5:04a2ab70c183dabe891a7cd190c3121d',
+            'description': '',
+            'duration': 1023,
+            'thumbnail': 'https://storage.eup.glcloud.eu/thumbnail/default_thumbnail.png',
+            'release_timestamp': 1743177199,
+            'release_date': '20250328',
+            'modified_timestamp': 1743180924,
+            'modified_date': '20250328',
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        display_id = mobj.group('id')
-        live_id_direct = mobj.group('live_id')
-
-        # --- Handle Direct HLS URL Input ---
-        if live_id_direct:
-            self._log_debug(f"Processing Direct HLS URL: {url}")
-            channel_direct = mobj.group('channel')
-            stream_type_direct = mobj.group('stream_type') or 'stream' # Default name if not specified
-            base_url = f'https://live.media.eup.glcloud.eu/hls/live/{live_id_direct}/{channel_direct}/{stream_type_direct}'
-
-            query_params_str = mobj.group(0).split('?', 1)[1] if '?' in mobj.group(0) else None
-            query_params = parse_qs(query_params_str) if query_params_str else {}
-            start_time_direct = traverse_obj(query_params, ('startTime', 0, {int_or_none}))
-            end_time_direct = traverse_obj(query_params, ('endTime', 0, {int_or_none}))
-
-            # Construct the final URL ensuring .m3u8 is present
-            final_url = base_url + ('' if base_url.endswith('.m3u8') else '.m3u8')
-            if start_time_direct and end_time_direct:
-                 final_url += f"?startTime={start_time_direct}&endTime={end_time_direct}"
-            elif query_params_str: # Append original query if not start/end time based
-                 final_url += f"?{query_params_str}"
-
-            # Basic title for direct URL
-            title = f'European Parliament Stream {live_id_direct}/{channel_direct}'
-
-            # HEAD check is good even for direct URLs
-            if not self._perform_head_check(final_url, f"{live_id_direct}-{channel_direct}", '(direct)'):
-                 raise ExtractorError(f'Direct HLS URL HEAD check failed: {final_url}', expected=True)
-
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                final_url, display_id or stream_type_direct, 'mp4', m3u8_id='hls', fatal=True)
-            if not formats: raise ExtractorError(f'Could not extract formats from direct HLS URL: {final_url}', expected=True)
-
-            return {
-                'id': display_id or stream_type_direct,
-                'title': title,
-                'formats': formats,
-                'subtitles': subtitles,
-                'is_live': not (start_time_direct and end_time_direct) and '.m3u8' not in stream_type_direct # Guess based on URL structure
-            }
-
-        # --- Handle Webstreaming Page URL ---
-        if not display_id: raise ExtractorError('Could not parse display ID from URL', expected=True)
-
-        self._log_debug(f"Processing Webstreaming Page: {display_id}")
-        webpage = self._download_webpage(url, display_id)
-        title = self._extract_title_from_webpage(webpage, display_id) # Get title early
-
-        self._log_debug(f'[{display_id}] Extracting metadata and iframe URL...')
-        nextjs_data = self._search_nextjs_data(webpage, display_id, default={})
-        media_info = traverse_obj(nextjs_data, ('props', 'pageProps', 'mediaItem')) or {}
-
-        # Get initial start time, but prioritize iframe JSON later
-        initial_start_timestamp = traverse_obj(media_info, ('mediaDate', {parse_iso8601}, {int_or_none}))
-        iframe_url = traverse_obj(media_info, 'iframeUrls') # Usually just one URL string
-
-        self._log_debug(f'[{display_id}] Initial Start Time={initial_start_timestamp}, Iframe URL={iframe_url}')
-
-        if not iframe_url:
-            raise ExtractorError(f'[{display_id}] Could not find iframe URL in page metadata.', expected=True)
-
-        # --- Attempt Extraction from Iframe JSON ---
-        self._log_debug(f'[{display_id}] Attempting extraction from iframe: {iframe_url}')
-        try:
-            iframe_content = self._download_webpage(iframe_url, display_id, note='Downloading iframe content')
-            json_data_str = self._search_regex(
-                r'<script id="ng-state" type="application/json"[^>]*>\s*({.+?})\s*</script>',
-                iframe_content, 'iframe JSON data', default=None)
-
-            if not json_data_str:
-                raise ExtractorError('Could not find ng-state JSON in iframe content.')
-
-            iframe_json = self._parse_json(json_data_str, display_id, fatal=True)
-
-            # Extract required info from the JSON structure
-            player_url_base = traverse_obj(iframe_json, ('contentEventKey', 'playerUrl'))
-            start_time = traverse_obj(iframe_json, ('contentEventKey', 'startTime', {int_or_none}))
-            end_time = traverse_obj(iframe_json, ('contentEventKey', 'endTime', {int_or_none}))
-            is_live = traverse_obj(iframe_json, ('contentEventKey', 'live')) # boolean
-            # Use title from JSON if available and seems better
-            json_title = traverse_obj(iframe_json, ('contentEventKey', 'title'))
-            if json_title: title = json_title
-
-
-            self._log_debug(f'[{display_id}] Found in iframe JSON: playerUrl={player_url_base}, startTime={start_time}, endTime={end_time}, is_live={is_live}')
-
-            if not player_url_base:
-                raise ExtractorError('Could not extract playerUrl from iframe JSON.')
-
-            # For recorded streams (archives), startTime and endTime are essential
-            if not is_live and (start_time is None or end_time is None):
-                 raise ExtractorError('Missing startTime or endTime in iframe JSON for recorded stream.')
-
-            # Construct the final URL
-            # Ensure base URL doesn't already have query params before adding ours
-            player_url_base = player_url_base.split('?')[0]
-            if not player_url_base.endswith('.m3u8'):
-                player_url_base += '.m3u8' # Ensure correct extension
-
-            if is_live:
-                 final_player_url = player_url_base # Live streams don't use start/end times
-            else:
-                 final_player_url = f"{player_url_base}?startTime={start_time}&endTime={end_time}"
-
-            # Perform HEAD check on the constructed URL
-            if not self._perform_head_check(final_player_url, display_id, '(dynamic)'):
-                 raise ExtractorError(f'Dynamic HLS URL from iframe failed HEAD check: {final_player_url}')
-
-            # Extract formats
-            self._log_debug(f'[{display_id}] Extracting formats from {final_player_url}')
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                final_player_url, display_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls', fatal=True) # Use fatal=True, if extraction fails, it's an error
-
-            if not formats:
-                 raise ExtractorError(f'Could not extract M3U8 formats from {final_player_url}', expected=True)
-
-            return {
-                'id': display_id,
-                'title': title,
-                'formats': formats,
-                'subtitles': subtitles,
-                'is_live': is_live,
-                'timestamp': start_time if not is_live else None, # Use JSON start time for VOD
-                'duration': (end_time - start_time) if not is_live and start_time and end_time else None,
-            }
-
-        except ExtractorError as e:
-            # Re-raise specific extractor errors
-            raise e
-        except Exception as e:
-            # Wrap unexpected errors
-            raise ExtractorError(f'[{display_id}] Error processing iframe content: {e}', cause=e)
-
-        # This part should ideally not be reached if iframe extraction is mandatory
-        raise ExtractorError(f'[{display_id}] Failed to extract stream information from iframe.', expected=True)
+        """Extract formats and metadata for a European Parliament webstream.
+
+        Downloads the content page from control.eup.glcloud.eu and reads the
+        'contentEventKey' entry of its embedded "ng-state" JSON.
+        """
+        lang, video_id = self._match_valid_url(url).group('lang', 'id')
+        # The 'lang' group captures the trailing slash ('en/') and is None for
+        # URLs without a language prefix; never send 'en/' or 'None' upstream.
+        lang = (lang or '').rstrip('/')
+        query = {
+            **({'lang': lang, 'audio': lang} if lang else {}),
+            'autoplay': 'true',
+            'logo': 'false',
+            'muted': 'false',
+            'fullscreen': 'true',
+            'disclaimer': 'false',
+            'multicast': 'true',
+            'analytics': 'false',
+        }
+        webpage = self._download_webpage(f'https://control.eup.glcloud.eu/content-manager/content-page/{video_id}',
+                                         video_id, 'Downloading iframe', query=query)
+        stream_info = self._search_json(r'<script [^>]*id="ng-state"[^>]*>', webpage, 'stream info', video_id)['contentEventKey']
+        player_url = stream_info.get('playerUrl')
+        if player_url:
+            live_status = 'was_live'
+            # No query when 'finalVod' is set; otherwise pass startTime/endTime on the manifest URL
+            query = None if stream_info.get('finalVod') else traverse_obj(stream_info, {
+                'startTime': ('startTime', {str_or_none}),
+                'endTime': ('endTime', {str_or_none}),
+            })
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(player_url, video_id, query=query, ext='mp4')
+        else:
+            # yt-dlp convention: 'formats' is a list and 'subtitles' a dict, even when empty
+            formats, subtitles = [], {}
+            live_status = 'is_upcoming'
+            self.raise_no_formats('Stream didn\'t start yet', True, video_id)
+        if stream_info.get('live'):
+            live_status = 'is_live'
+
+        return {
+            'formats': formats,
+            'subtitles': subtitles,
+            'live_status': live_status,
+            **traverse_obj(stream_info, {
+                'id': ('commonId', {str_or_none}),
+                'title': ('title', {str_or_none}),
+                'description': ('description', {str_or_none}),
+                'release_timestamp': ('startTime', {int_or_none}),
+                'duration': ('endTime', {lambda e: e and (s := stream_info.get('startTime')) and (e - s)}),
+                'thumbnail': ('posterFrame', {url_or_none}),
+                'modified_timestamp': ('meta', 'updatedAt', {parse_iso8601}),
+            }),
+        }