diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index b40d393a79..cd4cbf4dfd 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( int_or_none, @@ -10,15 +9,7 @@ from ..utils import ( traverse_obj, unified_strdate, xpath_text, - ExtractorError, - js_to_json, - urljoin ) -import re -import json -import time -import datetime - class EuropaIE(InfoExtractor): _WORKING = False @@ -54,7 +45,10 @@ class EuropaIE(InfoExtractor): def get_item(type_, preference): items = {} for item in playlist.findall(f'./info/{type_}/item'): - lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + lang, label = ( + xpath_text(item, 'lg', default=None), + xpath_text(item, 'label', default=None) + ) if lang and label: items[lang] = label.strip() for p in preference: @@ -63,7 +57,6 @@ class EuropaIE(InfoExtractor): query = parse_qs(url) preferred_lang = query.get('sitelang', ('en', ))[0] - preferred_langs = orderedSet((preferred_lang, 'en', 'int')) title = get_item('title', preferred_langs) or video_id @@ -102,320 +95,131 @@ class EuropaIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?: - multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+)| - live\.media\.eup\.glcloud\.eu/hls/live/(?P[\w-]+)/(?Pchannel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows - ) + https?://multimedia\.europarl\.europa\.eu/ + (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': '20220914-0900-PLENARY', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', 'ext': 'mp4', 'title': 'Plenary session', + 'release_timestamp': 1663139069, + 'release_date': '20220914', }, 'params': { 'skip_download': True, }, }, { - # Direct HLS stream URL (archive example similar to user provided) - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442', + # example of old live webstream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', 'info_dict': { - 'id': 'norsk-archive', # ID derived from filename before query 'ext': 'mp4', - 'title': 'European Parliament Stream', - }, - 'params': { - 'skip_download': True, - }, - },{ - # Direct HLS stream URL (live example) - 'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'European Parliament Stream', - }, - 'params': { - 'skip_download': True, + 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', + 'release_timestamp': 1668502800, + 'title': 'Euroscola 2022-11-15 19:21', + 'release_date': '20221115', + 'live_status': 'is_live', }, + 'skip': 'not live anymore', }] - # Known CDN endpoints - try these if direct extraction fails - # Added 2113713 and 2113713-b based on user's M3U8 - ENDPOINTS = ["2113753", "2113713", "2113713-b"] - - # Priority channels based on observed success rates & user M3U8 - # Added channel-01-stb - PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"] - - # Default stream types/filenames by content type - # These are used in the *fallback* guessing logic. - # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed. - LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"] - ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"] - - def _extract_direct_url_from_webpage(self, webpage): - """Extract direct m3u8 URLs from webpage with minimal logging""" - m3u8_urls = set() # Use a set to avoid duplicates - - # Search patterns for m3u8 URLs - # Added more flexibility for quotes and paths - for pattern in [ - r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']', - r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - # Look for assignments or attributes - r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']', - # Look for URLs within JSON-like structures in script tags - r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"', - ]: - matches = re.findall(pattern, webpage) - for match in matches: - # Handle potential tuple results from findall if multiple groups exist in regex - url_match = match if isinstance(match, str) else match[0] - # Basic sanity check - if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match: - # Remove any JS string escaping - url_match = url_match.replace('\\/', '/').replace('\\\\', '\\') - m3u8_urls.add(url_match) - - # Extract from network panel if available (less reliable parsing) - network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE) - if network_url_match: - url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\') - m3u8_urls.add(url_match) - - self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage') - return list(m3u8_urls) - - def _extract_title_from_webpage(self, webpage, display_id): - """Extract the title from the webpage""" - # Try different patterns to extract the title - for pattern in [ - r'([^<]+)', - r']*class="erpl_title-h1"[^>]*>([^<]+)', # Specific title class - r']*>([^<]+)', - r'"title"\s*:\s*"([^"]+)"', - ]: - title_match = re.search(pattern, webpage) - if title_match: - title = title_match.group(1).strip() - # Clean up common suffixes - title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip() - title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip() - if title: - return title - - return f"European Parliament Session - {display_id}" # Fallback title - - def _parse_meeting_date(self, display_id): - """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)""" - date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id) - if date_match: - date_str, time_str, _ = date_match.groups() - try: - # Parse the date components - year = int(date_str[:4]) - month = int(date_str[4:6]) - day = int(date_str[6:8]) - hour = int(time_str[:2]) - minute = int(time_str[2:4]) - - # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after) - # This helps catch streams that start slightly early or run long - meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC - start_dt = meeting_dt - datetime.timedelta(hours=3) - end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window - - # Convert to Unix timestamps - start_ts = int(start_dt.timestamp()) - end_ts = int(end_dt.timestamp()) - - self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}') - return start_ts, end_ts - - except (ValueError, OverflowError) as e: - self.to_screen(f'Error parsing date from display_id "{display_id}": {e}') - pass # Fall through to fallback - - # Fallback to a recent window if parsing fails or ID format is different - self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.') - now = int(time.time()) - start_time = now - (24 * 3600) # 24 hours ago (might be too short for older archives) - end_time = now + (1 * 3600) # 1 hour in the future (for live/recent) - return start_time, end_time - def _real_extract(self, url): - mobj = self._match_valid_url(url) - # Get potential IDs from the regex match groups - display_id = mobj.group('id') - live_id = mobj.group('live_id') - stream_id = mobj.group('stream_id') - channel = mobj.group('channel') - - # Use the most specific ID available - video_id = display_id or stream_id or live_id or channel - - # Handle direct HLS URLs first (most reliable if provided) - if live_id and (stream_id or channel): - # Clean up stream_id (remove query parameters for use as info dict id) - clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id - # If stream_id is missing but channel exists, use channel as part of the id - final_id = clean_stream_id or channel or 'unknown_stream' - # Remove potential .m3u8 suffix for cleaner ID - if final_id.endswith('.m3u8'): - final_id = final_id[:-5] - - self.to_screen(f'Processing direct HLS URL: {url}') - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - if not formats: - self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}') - # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work - # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True) + # Try to parse Next.js data for metadata + nextjs = self._search_nextjs_data(webpage, display_id, default={}) + page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={}) + media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives? + + title = media_info.get('title') or media_info.get('name') or display_id + release_timestamp = None + # Existing logic uses startDateTime, might need adjustment for archive start/end + if 'startDateTime' in media_info: + release_timestamp = parse_iso8601(media_info['startDateTime']) + + # Determine if it's Live or VOD/Archive (might need refinement) + # mediaSubType might be 'Live' or 'VOD' or something else + is_live = media_info.get('mediaSubType') == 'Live' + + # Search for any .m3u8 link first + m3u8_links = self._search_regex( + r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', + webpage, 'm3u8 URL', default=None, group=1, fatal=False + ) + # --- Potential modification area START --- + # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL, + # try constructing the index-archive.m3u8 URL here. + # Example (conceptual - requires actual start/end times and base URL logic): + # if not is_live and media_info.get('startTime') and media_info.get('endTime'): + # start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps + # end_time = media_info['endTime'] + # # Assuming m3u8_links contains a base URL that needs modification + # base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction + # archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}' + # m3u8_links = archive_url # Replace the found link with the constructed one + # --- Potential modification area END --- + + + if not m3u8_links: + self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.') + # Return basic info if no HLS manifest found return { - 'id': final_id, - 'title': 'European Parliament Stream', # Generic title for direct URLs - 'formats': formats or [], - 'subtitles': subtitles or {}, - 'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'release_timestamp': release_timestamp, + 'formats': [], } - # --- Fallback for multimedia.europarl.europa.eu URLs --- - if not display_id: # Should have display_id if it's not a direct HLS URL - raise ExtractorError('Failed to identify video ID from URL.') - - self.to_screen(f'Processing webpage URL: {url}') - webpage = self._download_webpage(url, display_id) - - # Check for live indicators more reliably - # Look for common live indicators in JS, classes, or text - is_live = bool(re.search( - r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)', - webpage, - re.IGNORECASE)) - self.to_screen(f'Detected as live: {is_live}') - - # Extract title - title = self._extract_title_from_webpage(webpage, display_id) - - # *** Strategy 1: Extract direct URLs from webpage (Preferred) *** - direct_urls = self._extract_direct_url_from_webpage(webpage) - formats = [] - subtitles = {} - - if direct_urls: - self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...') - for m3u8_url in direct_urls: - # Clean stream ID from URL for format identification - m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0] - if m3u8_stream_id.endswith('.m3u8'): - m3u8_stream_id = m3u8_stream_id[:-5] - - try: - fmt, subs = self._extract_m3u8_formats_and_subtitles( - m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error - - if fmt: - self.to_screen(f'Successfully extracted formats from: {m3u8_url}') - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - # If we found formats, we are likely done, return immediately - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL - } - else: - self.to_screen(f'No formats found in: {m3u8_url}') - except ExtractorError as e: - self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}') - pass # Try the next direct URL - else: - self.to_screen('No direct M3U8 URLs found in webpage.') - - - # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) *** - self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...') - - # Parse timestamps for archive retrieval (or use a window for live/unknown) - # Always parse, even if live, as it might be a recently finished live event - start_timestamp, end_timestamp = self._parse_meeting_date(display_id) - - # Use appropriate stream filenames for the content type - stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES - - # Try combinations with updated endpoints and channels - for endpoint in self.ENDPOINTS: - for channel_to_try in self.PRIORITY_CHANNELS: - for filename in stream_filenames: - base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}" - - # Determine if timestamps should be added - # Add timestamps if it's explicitly not live, OR if the filename suggests archive, - # OR if start/end timestamps were successfully parsed from the ID. - # Avoid timestamps for clearly live filenames unless forced by non-live status. - use_timestamps = ( - (not is_live or 'archive' in filename.lower()) - and start_timestamp and end_timestamp - ) - - test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url - - try: - self.to_screen(f'Trying guessed URL: {test_url}') - fmt, subs = self._extract_m3u8_formats_and_subtitles( - test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False) - - if fmt: - self.to_screen(f'Success with guessed URL: {test_url}') - formats.extend(fmt) - self._merge_subtitles(subs, target=subtitles) - # Found a working combination - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': not use_timestamps, # If we used timestamps, assume not live - } - else: - self.to_screen(f'No formats found in guessed URL: {test_url}') - - except ExtractorError as e: - # Log error lightly, as many guesses are expected to fail - self.to_screen(f'Guessed URL failed: {test_url} ({e})') - pass # Continue trying other combinations - - # *** If all strategies fail *** - self.to_screen('All extraction strategies failed.') - - # Provide helpful error with suggestions - error_message = ( - f"Could not extract stream URL for {display_id or url}. " - "The stream may be old, expired, or use an unsupported format.\n" - f"Live status detected: {is_live}\n" - "Common issues:\n" - "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n" - "- The event might not be available via the standard CDN endpoints/channels.\n" - "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n" - "Example (using parsed times, adjust if needed):\n" + # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist) + # The regex used here is identical to the one above, ensures we capture all instances + import re + all_links_text = self._html_search_regex( + r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', + webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences ) - if start_timestamp and end_timestamp: - example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}" - error_message += f'yt-dlp "{example_url}"' - else: - example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8" - error_message += f'yt-dlp "{example_url}"' + candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text) + + # If the specific constructed URL was made above, ensure it's prioritized or the only candidate + # (Refined logic needed here based on the modification above) + if not candidates and m3u8_links: # Fallback if findall failed but initial search worked + candidates = [m3u8_links] + elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included + candidates.insert(0, m3u8_links) + + candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order + + if not candidates: # Final check if still no candidates + self.report_warning('Could not extract any valid .m3u8 URLs.') + return { + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'release_timestamp': release_timestamp, + 'formats': [], + } + + + formats, subtitles = [], {} + for link in candidates: + # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive) + # The 'live' flag might need adjustment based on mediaSubType + fmts, subs = self._extract_m3u8_formats_and_subtitles( + link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) - - raise ExtractorError(error_message, expected=True) + return { + 'id': media_info.get('id') or display_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': release_timestamp, + # Report 'is_live' based on detected mediaSubType + 'is_live': is_live or None # Report None if not explicitly Live + }