From 2afe7e2fa3b21ab6f6879a05a06043eedbc864eb Mon Sep 17 00:00:00 2001
From: edmundman <45210014+edmundman@users.noreply.github.com>
Date: Tue, 25 Mar 2025 16:02:51 +0000
Subject: [PATCH] Update europa.py

this makes it work with videos from the archive but not live videos
---
 yt_dlp/extractor/europa.py | 205 ++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 129 deletions(-)

diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py
index 7470305e9..c3a03bf59 100644
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -10,10 +10,13 @@ from ..utils import (
     unified_strdate,
     xpath_text,
     ExtractorError,
+    js_to_json,
+    urljoin
 )
 import re
-import datetime
+import json
 import time
+import datetime
 
 
 class EuropaIE(InfoExtractor):
@@ -132,7 +135,6 @@ class EuroParlWebstreamIE(InfoExtractor):
             'display_id': '20250324-1500-COMMITTEE-HOUS',
             'ext': 'mp4',
             'title': 'Special committee on the Housing Crisis in the European Union Ordinary meeting',
-            'is_live': False,
         },
         'params': {
             'skip_download': True,
@@ -140,21 +142,13 @@ class EuroParlWebstreamIE(InfoExtractor):
     }]
 
     # Known working stream IDs (in order of likely success)
-    _ARCHIVE_STREAM_IDS = [
+    KNOWN_STREAM_IDS = [
         "index-archive",
         "norsk-archive",
     ]
-    
-    # Live stream IDs
-    _LIVE_STREAM_IDS = [
-        "index",
-        "master",
-        "playlist",
-        "norsk",
-    ]
 
     # Known CDN endpoints (in order of likely success)
-    _ENDPOINTS = [
+    KNOWN_ENDPOINTS = [
         "2113753",  # This appears to be the main endpoint
         "2113749",
         "2113750",
@@ -164,7 +158,7 @@ class EuroParlWebstreamIE(InfoExtractor):
     ]
 
     # Prioritized channel list based on observations (channel-07-bxl is often used)
-    _CHANNELS = [
+    PRIORITIZED_CHANNELS = [
         "channel-07-bxl",  # Most common based on examples
         "channel-03-bxl",  # Also seen in examples
         "channel-01-bxl",
@@ -179,6 +173,7 @@ class EuroParlWebstreamIE(InfoExtractor):
 
     def _parse_meeting_id(self, display_id):
         """Extract date and time information from the meeting ID."""
+        # Format: YYYYMMDD-HHMM-COMMITTEE-TYPE
         date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
         if date_match:
             date_str, time_str, meeting_type = date_match.groups()
@@ -196,14 +191,6 @@ class EuroParlWebstreamIE(InfoExtractor):
                 # Calculate a reasonable meeting duration (2 hours by default)
                 end_dt = meeting_dt + datetime.timedelta(hours=2)
                 
-                # Check if meeting is today or in the future (potential live stream)
-                now = datetime.datetime.now()
-                is_today = (meeting_dt.year == now.year and 
-                           meeting_dt.month == now.month and 
-                           meeting_dt.day == now.day)
-                is_future = meeting_dt > now
-                is_recent_past = now - meeting_dt < datetime.timedelta(hours=6)
-                
                 return {
                     'date': date_str,
                     'time': time_str,
@@ -212,10 +199,6 @@ class EuroParlWebstreamIE(InfoExtractor):
                     'end_dt': end_dt,
                     'start_timestamp': int(meeting_dt.timestamp()),
                     'end_timestamp': int(end_dt.timestamp()),
-                    'is_today': is_today,
-                    'is_future': is_future,
-                    'is_recent_past': is_recent_past,
-                    'is_live_candidate': is_today or is_future or is_recent_past,
                 }
             except (ValueError, OverflowError) as e:
                 self.report_warning(f"Failed to parse meeting date/time: {e}")
@@ -225,11 +208,11 @@ class EuroParlWebstreamIE(InfoExtractor):
         return {
             'start_timestamp': current_time - 86400,  # 24 hours ago
             'end_timestamp': current_time,
-            'is_live_candidate': True,  # Assume it might be live if we can't parse the time
         }
 
     def _find_m3u8_in_webpage(self, webpage):
         """Look for m3u8 URLs directly in the webpage."""
+        # Look for direct m3u8 URLs with timestamps
         m3u8_matches = re.findall(
             r'[\'"]((https?://live\.media\.eup\.glcloud\.eu/[^"\']+\.m3u8(?:\?[^\'"]*)?)[\'"])',
             webpage
@@ -252,40 +235,6 @@ class EuroParlWebstreamIE(InfoExtractor):
         title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
         return title
 
-    def _check_is_live(self, webpage):
-        """Check if the stream is likely to be live based on webpage content."""
-        live_indicators = [
-            r'(?i)live\s+now',
-            r'(?i)streaming\s+live',
-            r'(?i)watch\s+live',
-            r'(?i)live\s+stream',
-            r'(?i)currently\s+live',
-            r'(?i)livestream',
-            r'isLive\s*[:=]\s*true',
-            r'"isLive"\s*:\s*true',
-            r'data-is-live\s*=\s*["\'](true|1)["\']',
-        ]
-        
-        for indicator in live_indicators:
-            if re.search(indicator, webpage):
-                return True
-        
-        return False
-
-    def _try_url(self, url, display_id):
-        """Try a single URL and return formats and subtitles if successful."""
-        try:
-            self.to_screen(f"Trying URL: {url}")
-            fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                url, display_id, 'mp4', m3u8_id='hls', fatal=False)
-            
-            if fmt:
-                return fmt, subs
-        except ExtractorError as e:
-            self.report_warning(f"Failed with URL {url}: {e}")
-        
-        return None, None
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         display_id = mobj.group('id')
@@ -312,104 +261,102 @@ class EuroParlWebstreamIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
         title = self._extract_title_from_webpage(webpage)
         
-        # Check if this is likely to be a live stream
-        is_live_page = self._check_is_live(webpage)
-        
         # First, look for m3u8 URLs directly in the page
         direct_urls = self._find_m3u8_in_webpage(webpage)
         if direct_urls:
             self.to_screen(f"Found {len(direct_urls)} potential stream URLs in webpage")
             for m3u8_url in direct_urls:
-                formats, subtitles = self._try_url(m3u8_url, display_id)
-                if formats:
-                    return {
-                        'id': display_id,
-                        'display_id': display_id,
-                        'title': title,
-                        'formats': formats,
-                        'subtitles': subtitles,
-                        'is_live': is_live_page,
-                    }
+                try:
+                    self.to_screen(f"Trying direct URL: {m3u8_url}")
+                    formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                        m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                    
+                    if formats:
+                        return {
+                            'id': display_id,
+                            'display_id': display_id,
+                            'title': title,
+                            'formats': formats,
+                            'subtitles': subtitles,
+                        }
+                except ExtractorError as e:
+                    self.report_warning(f"Failed with direct URL {m3u8_url}: {e}")
         
-        # Parse the meeting ID and check if this is potentially a live stream
+        # If no direct URLs found, parse the meeting ID and generate likely timestamps
         meeting_info = self._parse_meeting_id(display_id)
         start_timestamp = meeting_info.get('start_timestamp')
         end_timestamp = meeting_info.get('end_timestamp')
-        is_live_candidate = meeting_info.get('is_live_candidate', False) or is_live_page
         
         self.to_screen(f"Generated timestamps for meeting: start={start_timestamp}, end={end_timestamp}")
-        self.to_screen(f"Stream is likely {'live' if is_live_candidate else 'archived'}")
         
-        # First check for live streams if this is a live candidate
-        if is_live_candidate:
-            self.to_screen("Checking for live stream URLs first")
-            
-            for endpoint in self._ENDPOINTS[:2]:  # Only try the first two endpoints for live
-                for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for live
-                    for stream_type in self._LIVE_STREAM_IDS:
-                        # For live streams, try URLs without timestamps
-                        live_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8"
-                        formats, subtitles = self._try_url(live_url, display_id)
-                        
-                        if formats:
-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': True,
-                            }
+        # Try a variety of possibilities, starting with the most likely combinations
+        formats = []
+        subtitles = {}
+        working_url = None
         
-        # Try archived streams with prioritized channels
-        for channel in self._CHANNELS:
-            for stream_type in self._ARCHIVE_STREAM_IDS:
-                # For archived content, include timestamps
-                archive_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                formats, subtitles = self._try_url(archive_url, display_id)
+        # Main endpoint with prioritized channels
+        for channel in self.PRIORITIZED_CHANNELS:
+            for stream_type in self.KNOWN_STREAM_IDS:
+                candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                self.to_screen(f"Trying URL: {candidate_url}")
                 
-                if formats:
-                    return {
-                        'id': display_id,
-                        'display_id': display_id,
-                        'title': title,
-                        'formats': formats,
-                        'subtitles': subtitles,
-                        'is_live': False,
-                    }
-        
-        # If main endpoint + prioritized channels didn't work, try other endpoints
-        for endpoint in self._ENDPOINTS[1:]:
-            for channel in self._CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
-                for stream_type in self._ARCHIVE_STREAM_IDS:
-                    archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-                    formats, subtitles = self._try_url(archive_url, display_id)
+                try:
+                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                        candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
                     
-                    if formats:
+                    if fmt:
+                        formats.extend(fmt)
+                        self._merge_subtitles(subs, target=subtitles)
+                        working_url = candidate_url
+                        self.to_screen(f"Success! Found working URL: {working_url}")
+                        
                         return {
                             'id': display_id,
                             'display_id': display_id,
                             'title': title,
                             'formats': formats,
                             'subtitles': subtitles,
-                            'is_live': False,
                         }
+                except ExtractorError as e:
+                    self.report_warning(f"Failed with URL {candidate_url}: {e}")
+        
+        # If main endpoint + prioritized channels didn't work, try other endpoints
+        for endpoint in self.KNOWN_ENDPOINTS[1:]:  # Skip the first one as we already tried it
+            for channel in self.PRIORITIZED_CHANNELS[:3]:  # Only try the top 3 channels for other endpoints
+                for stream_type in self.KNOWN_STREAM_IDS:
+                    candidate_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
+                    self.to_screen(f"Trying URL: {candidate_url}")
+                    
+                    try:
+                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                            candidate_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
+                        
+                        if fmt:
+                            formats.extend(fmt)
+                            self._merge_subtitles(subs, target=subtitles)
+                            working_url = candidate_url
+                            self.to_screen(f"Success! Found working URL: {working_url}")
+                            
+                            return {
+                                'id': display_id,
+                                'display_id': display_id,
+                                'title': title,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                            }
+                    except ExtractorError as e:
+                        self.report_warning(f"Failed with URL {candidate_url}: {e}")
         
         # If we've reached here, we need to give a helpful error message
         parsed_date = f"{meeting_info.get('date', 'unknown-date')}"
         parsed_time = f"{meeting_info.get('time', 'unknown-time')}"
         
-        # Provide different suggestions based on whether it's likely live or archived
-        if is_live_candidate:
-            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8"
-            suggestion_text = f"For live streams, try: yt-dlp \"{suggested_url}\""
-        else:
-            suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-            suggestion_text = f"For archived content, try: yt-dlp \"{suggested_url}\""
+        # Provide the most likely URL for manual use
+        suggested_url = f"https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
         
         raise ExtractorError(
             f"Could not extract stream URL for {display_id}. The European Parliament stream may not be available.\n"
-            f"Attempted to find a {'live' if is_live_candidate else 'archived'} stream for date: {parsed_date}, time: {parsed_time}.\n"
-            f"{suggestion_text}",
+            f"Attempted to find a stream for date: {parsed_date}, time: {parsed_time}.\n"
+            f"Try using yt-dlp directly with: yt-dlp \"{suggested_url}\"",
             expected=True
         )