From fe08c6ca27701c8199300f48c9320fd4f20584fb Mon Sep 17 00:00:00 2001 From: edmundman <45210014+edmundman@users.noreply.github.com> Date: Fri, 28 Mar 2025 12:49:15 +0000 Subject: [PATCH] Update europa.py --- yt_dlp/extractor/europa.py | 351 ++++++++++++++++++++++--------------- 1 file changed, 205 insertions(+), 146 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 8dd0ddcd88..b40d393a79 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,3 +1,4 @@ +# coding: utf-8 from .common import InfoExtractor from ..utils import ( int_or_none, @@ -103,7 +104,7 @@ class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?: multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| - live\.media\.eup\.glcloud\.eu/hls/live/(?P\d+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w-]+)(?:\.m3u8|/master\.m3u8) + live\.media\.eup\.glcloud\.eu/hls/live/(?P[\w-]+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows ) ''' _TESTS = [{ @@ -118,10 +119,21 @@ class EuroParlWebstreamIE(InfoExtractor): 'skip_download': True, }, }, { - # Direct HLS stream URL - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index-archive.m3u8?startTime=1742828675&endTime=1742832870', + # Direct HLS stream URL (archive example similar to user provided) + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442', 'info_dict': { - 'id': 'index-archive', + 'id': 'norsk-archive', # ID derived from filename before query + 'ext': 'mp4', + 'title': 'European Parliament Stream', + }, + 'params': { + 'skip_download': True, + }, + },{ + # Direct HLS stream URL (live example) + 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8', + 'info_dict': { + 'id': 'index', 'ext': 'mp4', 'title': 'European Parliament Stream', }, @@ -130,43 +142,53 @@ class EuroParlWebstreamIE(InfoExtractor): }, }] - # Main CDN endpoint - primarily target this instead of trying multiple - MAIN_ENDPOINT = "2113753" - - # Priority channels based on observed success rates - PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-bxl", "channel-10-bxl"] - - # Default stream types by content type - LIVE_STREAM_TYPES = ["index", "master", "playlist"] - ARCHIVE_STREAM_TYPES = ["index-archive", "norsk-archive", "index", "master"] + # Known CDN endpoints - try these if direct extraction fails + # Added 2113713 and 2113713-b based on user's M3U8 + ENDPOINTS = ["2113753", "2113713", "2113713-b"] + + # Priority channels based on observed success rates & user M3U8 + # Added channel-01-stb + PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"] + + # Default stream types/filenames by content type + # These are used in the *fallback* guessing logic. + # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed. + LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"] + ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"] def _extract_direct_url_from_webpage(self, webpage): """Extract direct m3u8 URLs from webpage with minimal logging""" - m3u8_urls = [] - + m3u8_urls = set() # Use a set to avoid duplicates + # Search patterns for m3u8 URLs + # Added more flexibility for quotes and paths for pattern in [ - r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\']*)?)["\']', + r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']', r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - r'=[^\n]*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', + # Look for assignments or attributes + r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', + # Look for URLs within JSON-like structures in script tags + r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', + r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', ]: matches = re.findall(pattern, webpage) - if matches: - m3u8_urls.extend(matches) - - # Clean up URLs - clean_urls = [] - for url in m3u8_urls: - # Remove any JS string escaping - url = url.replace('\\/', '/').replace('\\\\', '\\') - clean_urls.append(url) - - # Extract from network panel if available + for match in matches: + # Handle potential tuple results from findall if multiple groups exist in regex + url_match = match if isinstance(match, str) else match[0] + # Basic sanity check + if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match: + # Remove any JS string escaping + url_match = url_match.replace('\\/', '/').replace('\\\\', '\\') + m3u8_urls.add(url_match) + + # Extract from network panel if available (less reliable parsing) network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE) if network_url_match: - clean_urls.append(network_url_match.group(1)) - - return clean_urls + url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\') + m3u8_urls.add(url_match) + + self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage') + return list(m3u8_urls) def _extract_title_from_webpage(self, webpage, display_id): """Extract the title from the webpage""" @@ -174,6 +196,7 @@ class EuroParlWebstreamIE(InfoExtractor): for pattern in [ r'([^<]+)', + r']*class="erpl_title-h1"[^>]*>([^<]+)', # Specific title class r']*>([^<]+)', r'"title"\s*:\s*"([^"]+)"', ]: @@ -181,17 +204,18 @@ class EuroParlWebstreamIE(InfoExtractor): if title_match: title = title_match.group(1).strip() # Clean up common suffixes - title = re.sub(r'\s*\|\s*European Parliament$', '', title) - title = re.sub(r'\s*-\s*Multimedia Centre$', '', title) - return title - - return f"European Parliament Session - {display_id}" + title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() + title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip() + if title: + return title + + return f"European Parliament Session - {display_id}" # Fallback title def _parse_meeting_date(self, display_id): """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)""" date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) if date_match: - date_str, time_str, meeting_type = date_match.groups() + date_str, time_str, _ = date_match.groups() try: # Parse the date components year = int(date_str[:4]) @@ -199,164 +223,199 @@ class EuroParlWebstreamIE(InfoExtractor): day = int(date_str[6:8]) hour = int(time_str[:2]) minute = int(time_str[2:4]) - - # Create timestamps with a generous window (3 hours before and after) - meeting_dt = datetime.datetime(year, month, day, hour, minute) + + # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after) + # This helps catch streams that start slightly early or run long + meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC start_dt = meeting_dt - datetime.timedelta(hours=3) - end_dt = meeting_dt + datetime.timedelta(hours=6) - - # Convert to timestamps + end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window + + # Convert to Unix timestamps start_ts = int(start_dt.timestamp()) end_ts = int(end_dt.timestamp()) - + + self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}') return start_ts, end_ts - - except (ValueError, OverflowError): - pass - - # Fallback to a recent 48-hour window + + except (ValueError, OverflowError) as e: + self.to_screen(f'Error parsing date from display_id "{display_id}": {e}') + pass # Fall through to fallback + + # Fallback to a recent window if parsing fails or ID format is different + self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.') now = int(time.time()) - start_time = now - (48 * 3600) # 48 hours ago - return start_time, now + start_time = now - (24 * 3600) # 24 hours ago (might be too short for older archives) + end_time = now + (1 * 3600) # 1 hour in the future (for live/recent) + return start_time, end_time def _real_extract(self, url): mobj = self._match_valid_url(url) + # Get potential IDs from the regex match groups display_id = mobj.group('id') live_id = mobj.group('live_id') stream_id = mobj.group('stream_id') channel = mobj.group('channel') - # Handle direct HLS URLs - if live_id and stream_id: - # Remove query parameters from stream_id if present - clean_stream_id = stream_id.split('?')[0] if '?' in stream_id else stream_id - + # Use the most specific ID available + video_id = display_id or stream_id or live_id or channel + + # Handle direct HLS URLs first (most reliable if provided) + if live_id and (stream_id or channel): + # Clean up stream_id (remove query parameters for use as info dict id) + clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id + # If stream_id is missing but channel exists, use channel as part of the id + final_id = clean_stream_id or channel or 'unknown_stream' + # Remove potential .m3u8 suffix for cleaner ID + if final_id.endswith('.m3u8'): + final_id = final_id[:-5] + + self.to_screen(f'Processing direct HLS URL: {url}') formats, subtitles = self._extract_m3u8_formats_and_subtitles( - url, clean_stream_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) - + url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues + + if not formats: + self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}') + # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work + # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True) + return { - 'id': clean_stream_id, - 'title': 'European Parliament Stream', - 'formats': formats, - 'subtitles': subtitles, + 'id': final_id, + 'title': 'European Parliament Stream', # Generic title for direct URLs + 'formats': formats or [], + 'subtitles': subtitles or {}, + 'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL } - # Download the webpage for standard europarl URLs + # --- Fallback for multimedia.europarl.europa.eu URLs --- + if not display_id: # Should have display_id if it's not a direct HLS URL + raise ExtractorError('Failed to identify video ID from URL.') + + self.to_screen(f'Processing webpage URL: {url}') webpage = self._download_webpage(url, display_id) - - # Check for live indicators - is_live = bool(re.search(r'(?:isLive|livestream|live-stream|\"live\"\s*:\s*true)', webpage, re.IGNORECASE)) - + + # Check for live indicators more reliably + # Look for common live indicators in JS, classes, or text + is_live = bool(re.search( + r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)', + webpage, + re.IGNORECASE)) + self.to_screen(f'Detected as live: {is_live}') + # Extract title title = self._extract_title_from_webpage(webpage, display_id) - - # First try direct URLs from the webpage (this is the most reliable approach) + + # *** Strategy 1: Extract direct URLs from webpage (Preferred) *** direct_urls = self._extract_direct_url_from_webpage(webpage) - - # Track whether we successfully found a stream formats = [] subtitles = {} - + if direct_urls: + self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...') for m3u8_url in direct_urls: + # Clean stream ID from URL for format identification + m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0] + if m3u8_stream_id.endswith('.m3u8'): + m3u8_stream_id = m3u8_stream_id[:-5] + try: fmt, subs = self._extract_m3u8_formats_and_subtitles( - m3u8_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - + m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error + if fmt: + self.to_screen(f'Successfully extracted formats from: {m3u8_url}') formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - + # If we found formats, we are likely done, return immediately return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': is_live, + 'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL } - except ExtractorError: - pass - - # Parse timestamps for archive retrieval (or use current time for live) - if is_live: - # For live streams, we don't need timestamps - start_timestamp, end_timestamp = None, None + else: + self.to_screen(f'No formats found in: {m3u8_url}') + except ExtractorError as e: + self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}') + pass # Try the next direct URL else: - start_timestamp, end_timestamp = self._parse_meeting_date(display_id) - - # Use appropriate stream types for the content type - stream_types = self.LIVE_STREAM_TYPES if is_live else self.ARCHIVE_STREAM_TYPES - - # Try combinations with improved targeting - for channel in self.PRIORITY_CHANNELS: - for stream_type in stream_types: - # For live streams, try without timestamps first - if is_live: - live_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8" - - try: - fmt, subs = self._extract_m3u8_formats_and_subtitles( - live_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - - if fmt: - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': True, - } - except ExtractorError: - pass - - # For archived content (or as fallback for live), try with timestamps - if start_timestamp and end_timestamp: - archive_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/{channel}/{stream_type}.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - + self.to_screen('No direct M3U8 URLs found in webpage.') + + + # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) *** + self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...') + + # Parse timestamps for archive retrieval (or use a window for live/unknown) + # Always parse, even if live, as it might be a recently finished live event + start_timestamp, end_timestamp = self._parse_meeting_date(display_id) + + # Use appropriate stream filenames for the content type + stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES + + # Try combinations with updated endpoints and channels + for endpoint in self.ENDPOINTS: + for channel_to_try in self.PRIORITY_CHANNELS: + for filename in stream_filenames: + base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}" + + # Determine if timestamps should be added + # Add timestamps if it's explicitly not live, OR if the filename suggests archive, + # OR if start/end timestamps were successfully parsed from the ID. + # Avoid timestamps for clearly live filenames unless forced by non-live status. + use_timestamps = ( + (not is_live or 'archive' in filename.lower()) + and start_timestamp and end_timestamp + ) + + test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url + try: + self.to_screen(f'Trying guessed URL: {test_url}') fmt, subs = self._extract_m3u8_formats_and_subtitles( - archive_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - + test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False) + if fmt: + self.to_screen(f'Success with guessed URL: {test_url}') formats.extend(fmt) self._merge_subtitles(subs, target=subtitles) - + # Found a working combination return { 'id': display_id, 'display_id': display_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'is_live': False, + 'is_live': not use_timestamps, # If we used timestamps, assume not live } - except ExtractorError: - pass - - # Provide helpful error with the most likely working URLs - suggested_urls = [] - - # Add the URLs that are most likely to work based on the logs and screenshots + else: + self.to_screen(f'No formats found in guessed URL: {test_url}') + + except ExtractorError as e: + # Log error lightly, as many guesses are expected to fail + self.to_screen(f'Guessed URL failed: {test_url} ({e})') + pass # Continue trying other combinations + + # *** If all strategies fail *** + self.to_screen('All extraction strategies failed.') + + # Provide helpful error with suggestions + error_message = ( + f"Could not extract stream URL for {display_id or url}. " + "The stream may be old, expired, or use an unsupported format.\n" + f"Live status detected: {is_live}\n" + "Common issues:\n" + "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n" + "- The event might not be available via the standard CDN endpoints/channels.\n" + "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n" + "Example (using parsed times, adjust if needed):\n" + ) if start_timestamp and end_timestamp: - suggested_urls.extend([ - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}", - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - ]) + example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" + error_message += f'yt-dlp "{example_url}"' else: - suggested_urls.extend([ - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-07-bxl/index.m3u8", - f"https://live.media.eup.glcloud.eu/hls/live/{self.MAIN_ENDPOINT}/channel-01-bxl/index.m3u8" - ]) - - suggestions = "\n".join([f"yt-dlp \"{url}\"" for url in suggested_urls]) - - raise ExtractorError( - f"Could not extract stream URL for {display_id or url}. The European Parliament stream may not be available.\n" - f"Live stream detected: {is_live}\n" - f"Try using yt-dlp directly with one of these URLs:\n{suggestions}", - expected=True - ) + example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8" + error_message += f'yt-dlp "{example_url}"' + + + raise ExtractorError(error_message, expected=True)