Update europa.py

6 months ago · 6e3ddbbe4d
parent fe08c6ca27
commit 6e3ddbbe4d
1 changed file with 107 additions and 303 deletions
--- a/yt_dlp/extractor/europa.py
+++ b/yt_dlp/extractor/europa.py
@@ -1,4 +1,3 @@
-# coding: utf-8
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
@@ -10,15 +9,7 @@ from ..utils import (
    traverse_obj,
    unified_strdate,
    xpath_text,
-    ExtractorError,
-    js_to_json,
-    urljoin
 )
-import re
-import json
-import time
-import datetime
-

 class EuropaIE(InfoExtractor):
    _WORKING = False
@@ -54,7 +45,10 @@ class EuropaIE(InfoExtractor):
        def get_item(type_, preference):
            items = {}
            for item in playlist.findall(f'./info/{type_}/item'):
-                lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+                lang, label = (
+                    xpath_text(item, 'lg', default=None),
+                    xpath_text(item, 'label', default=None)
+                )
                if lang and label:
                    items[lang] = label.strip()
            for p in preference:
@@ -63,7 +57,6 @@ class EuropaIE(InfoExtractor):

        query = parse_qs(url)
        preferred_lang = query.get('sitelang', ('en', ))[0]
-
        preferred_langs = orderedSet((preferred_lang, 'en', 'int'))

        title = get_item('title', preferred_langs) or video_id
@@ -102,320 +95,131 @@ class EuropaIE(InfoExtractor):

 class EuroParlWebstreamIE(InfoExtractor):
    _VALID_URL = r'''(?x)
-        https?://(?:
-            multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
-            live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>[\w-]+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows
-        )
+        https?://multimedia\.europarl\.europa\.eu/
+        (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
    '''
    _TESTS = [{
        'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
        'info_dict': {
-            'id': '20220914-0900-PLENARY',
+            'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
            'display_id': '20220914-0900-PLENARY',
            'ext': 'mp4',
            'title': 'Plenary session',
+            'release_timestamp': 1663139069,
+            'release_date': '20220914',
        },
        'params': {
            'skip_download': True,
        },
    }, {
-        # Direct HLS stream URL (archive example similar to user provided)
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442',
+        # example of old live webstream
+        'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
        'info_dict': {
-            'id': 'norsk-archive', # ID derived from filename before query
            'ext': 'mp4',
-            'title': 'European Parliament Stream',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    },{
-        # Direct HLS stream URL (live example)
-        'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8',
-        'info_dict': {
-            'id': 'index',
-            'ext': 'mp4',
-            'title': 'European Parliament Stream',
-        },
-        'params': {
-            'skip_download': True,
+            'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
+            'release_timestamp': 1668502800,
+            'title': 'Euroscola 2022-11-15 19:21',
+            'release_date': '20221115',
+            'live_status': 'is_live',
        },
+        'skip': 'not live anymore',
    }]

-    # Known CDN endpoints - try these if direct extraction fails
-    # Added 2113713 and 2113713-b based on user's M3U8
-    ENDPOINTS = ["2113753", "2113713", "2113713-b"]
-
-    # Priority channels based on observed success rates & user M3U8
-    # Added channel-01-stb
-    PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"]
-
-    # Default stream types/filenames by content type
-    # These are used in the *fallback* guessing logic.
-    # The complex paths like input/1/256/... seen in the user M3U8 CANNOT be guessed.
-    LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"]
-    ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"]
-
-    def _extract_direct_url_from_webpage(self, webpage):
-        """Extract direct m3u8 URLs from webpage with minimal logging"""
-        m3u8_urls = set() # Use a set to avoid duplicates
-
-        # Search patterns for m3u8 URLs
-        # Added more flexibility for quotes and paths
-        for pattern in [
-            r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']',
-            r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            # Look for assignments or attributes
-            r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
-            # Look for URLs within JSON-like structures in script tags
-            r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-            r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
-        ]:
-            matches = re.findall(pattern, webpage)
-            for match in matches:
-                # Handle potential tuple results from findall if multiple groups exist in regex
-                url_match = match if isinstance(match, str) else match[0]
-                # Basic sanity check
-                if '.m3u8' in url_match and 'live.media.eup.glcloud.eu' in url_match:
-                    # Remove any JS string escaping
-                    url_match = url_match.replace('\\/', '/').replace('\\\\', '\\')
-                    m3u8_urls.add(url_match)
-
-        # Extract from network panel if available (less reliable parsing)
-        network_url_match = re.search(r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)', webpage, re.IGNORECASE)
-        if network_url_match:
-            url_match = network_url_match.group(1).replace('\\/', '/').replace('\\\\', '\\')
-            m3u8_urls.add(url_match)
-
-        self.to_screen(f'Found {len(m3u8_urls)} potential direct M3U8 URLs in webpage')
-        return list(m3u8_urls)
-
-    def _extract_title_from_webpage(self, webpage, display_id):
-        """Extract the title from the webpage"""
-        # Try different patterns to extract the title
-        for pattern in [
-            r'<meta property="og:title" content="([^"]+)"',
-            r'<title>([^<]+)</title>',
-            r'<h1[^>]*class="erpl_title-h1"[^>]*>([^<]+)</h1>', # Specific title class
-            r'<h1[^>]*>([^<]+)</h1>',
-            r'"title"\s*:\s*"([^"]+)"',
-        ]:
-            title_match = re.search(pattern, webpage)
-            if title_match:
-                title = title_match.group(1).strip()
-                # Clean up common suffixes
-                title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
-                title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip()
-                if title:
-                    return title
-
-        return f"European Parliament Session - {display_id}" # Fallback title
-
-    def _parse_meeting_date(self, display_id):
-        """Parse the date from the meeting ID format (YYYYMMDD-HHMM-TYPE)"""
-        date_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id)
-        if date_match:
-            date_str, time_str, _ = date_match.groups()
-            try:
-                # Parse the date components
-                year = int(date_str[:4])
-                month = int(date_str[4:6])
-                day = int(date_str[6:8])
-                hour = int(time_str[:2])
-                minute = int(time_str[2:4])
-
-                # Create timestamps with a generous window (e.g., 3 hours before, 6 hours after)
-                # This helps catch streams that start slightly early or run long
-                meeting_dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc) # Assume UTC
-                start_dt = meeting_dt - datetime.timedelta(hours=3)
-                end_dt = meeting_dt + datetime.timedelta(hours=6) # Increased end window
-
-                # Convert to Unix timestamps
-                start_ts = int(start_dt.timestamp())
-                end_ts = int(end_dt.timestamp())
-
-                self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}')
-                return start_ts, end_ts
-
-            except (ValueError, OverflowError) as e:
-                 self.to_screen(f'Error parsing date from display_id "{display_id}": {e}')
-                 pass # Fall through to fallback
-
-        # Fallback to a recent window if parsing fails or ID format is different
-        self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.')
-        now = int(time.time())
-        start_time = now - (24 * 3600)  # 24 hours ago (might be too short for older archives)
-        end_time = now + (1 * 3600)      # 1 hour in the future (for live/recent)
-        return start_time, end_time
-
    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        # Get potential IDs from the regex match groups
-        display_id = mobj.group('id')
-        live_id = mobj.group('live_id')
-        stream_id = mobj.group('stream_id')
-        channel = mobj.group('channel')
-
-        # Use the most specific ID available
-        video_id = display_id or stream_id or live_id or channel
-
-        # Handle direct HLS URLs first (most reliable if provided)
-        if live_id and (stream_id or channel):
-            # Clean up stream_id (remove query parameters for use as info dict id)
-            clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id
-            # If stream_id is missing but channel exists, use channel as part of the id
-            final_id = clean_stream_id or channel or 'unknown_stream'
-            # Remove potential .m3u8 suffix for cleaner ID
-            if final_id.endswith('.m3u8'):
-                 final_id = final_id[:-5]
-
-            self.to_screen(f'Processing direct HLS URL: {url}')
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)

-            if not formats:
-                 self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}')
-                 # Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work
-                 # raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True)
+        # Try to parse Next.js data for metadata
+        nextjs = self._search_nextjs_data(webpage, display_id, default={})
+        page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={})
+        media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives?
+
+        title = media_info.get('title') or media_info.get('name') or display_id
+        release_timestamp = None
+        # Existing logic uses startDateTime, might need adjustment for archive start/end
+        if 'startDateTime' in media_info:
+             release_timestamp = parse_iso8601(media_info['startDateTime'])
+
+        # Determine if it's Live or VOD/Archive (might need refinement)
+        # mediaSubType might be 'Live' or 'VOD' or something else
+        is_live = media_info.get('mediaSubType') == 'Live'
+
+        # Search for any .m3u8 link first
+        m3u8_links = self._search_regex(
+            r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
+            webpage, 'm3u8 URL', default=None, group=1, fatal=False
+        )

+        # --- Potential modification area START ---
+        # If it's NOT live, and we have start/end times, and m3u8_links points to a live URL,
+        # try constructing the index-archive.m3u8 URL here.
+        # Example (conceptual - requires actual start/end times and base URL logic):
+        # if not is_live and media_info.get('startTime') and media_info.get('endTime'):
+        #     start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps
+        #     end_time = media_info['endTime']
+        #     # Assuming m3u8_links contains a base URL that needs modification
+        #     base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction
+        #     archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}'
+        #     m3u8_links = archive_url # Replace the found link with the constructed one
+        # --- Potential modification area END ---
+
+
+        if not m3u8_links:
+            self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.')
+            # Return basic info if no HLS manifest found
            return {
-                'id': final_id,
-                'title': 'European Parliament Stream', # Generic title for direct URLs
-                'formats': formats or [],
-                'subtitles': subtitles or {},
-                'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL
+                'id': media_info.get('id') or display_id,
+                'display_id': display_id,
+                'title': title,
+                'release_timestamp': release_timestamp,
+                'formats': [],
            }

-        # --- Fallback for multimedia.europarl.europa.eu URLs ---
-        if not display_id: # Should have display_id if it's not a direct HLS URL
-             raise ExtractorError('Failed to identify video ID from URL.')
-
-        self.to_screen(f'Processing webpage URL: {url}')
-        webpage = self._download_webpage(url, display_id)
-
-        # Check for live indicators more reliably
-        # Look for common live indicators in JS, classes, or text
-        is_live = bool(re.search(
-            r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)',
-            webpage,
-            re.IGNORECASE))
-        self.to_screen(f'Detected as live: {is_live}')
-
-        # Extract title
-        title = self._extract_title_from_webpage(webpage, display_id)
-
-        # *** Strategy 1: Extract direct URLs from webpage (Preferred) ***
-        direct_urls = self._extract_direct_url_from_webpage(webpage)
-        formats = []
-        subtitles = {}
-
-        if direct_urls:
-            self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...')
-            for m3u8_url in direct_urls:
-                # Clean stream ID from URL for format identification
-                m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0]
-                if m3u8_stream_id.endswith('.m3u8'):
-                    m3u8_stream_id = m3u8_stream_id[:-5]
-
-                try:
-                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                        m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error
-
-                    if fmt:
-                        self.to_screen(f'Successfully extracted formats from: {m3u8_url}')
-                        formats.extend(fmt)
-                        self._merge_subtitles(subs, target=subtitles)
-                        # If we found formats, we are likely done, return immediately
-                        return {
-                            'id': display_id,
-                            'display_id': display_id,
-                            'title': title,
-                            'formats': formats,
-                            'subtitles': subtitles,
-                            'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL
-                        }
-                    else:
-                        self.to_screen(f'No formats found in: {m3u8_url}')
-                except ExtractorError as e:
-                    self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}')
-                    pass # Try the next direct URL
-        else:
-            self.to_screen('No direct M3U8 URLs found in webpage.')
-
-
-        # *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) ***
-        self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...')
-
-        # Parse timestamps for archive retrieval (or use a window for live/unknown)
-        # Always parse, even if live, as it might be a recently finished live event
-        start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
-
-        # Use appropriate stream filenames for the content type
-        stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES
-
-        # Try combinations with updated endpoints and channels
-        for endpoint in self.ENDPOINTS:
-            for channel_to_try in self.PRIORITY_CHANNELS:
-                for filename in stream_filenames:
-                    base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}"
-
-                    # Determine if timestamps should be added
-                    # Add timestamps if it's explicitly not live, OR if the filename suggests archive,
-                    # OR if start/end timestamps were successfully parsed from the ID.
-                    # Avoid timestamps for clearly live filenames unless forced by non-live status.
-                    use_timestamps = (
-                        (not is_live or 'archive' in filename.lower())
-                        and start_timestamp and end_timestamp
-                    )
-
-                    test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url
-
-                    try:
-                        self.to_screen(f'Trying guessed URL: {test_url}')
-                        fmt, subs = self._extract_m3u8_formats_and_subtitles(
-                            test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False)
-
-                        if fmt:
-                            self.to_screen(f'Success with guessed URL: {test_url}')
-                            formats.extend(fmt)
-                            self._merge_subtitles(subs, target=subtitles)
-                            # Found a working combination
-                            return {
-                                'id': display_id,
-                                'display_id': display_id,
-                                'title': title,
-                                'formats': formats,
-                                'subtitles': subtitles,
-                                'is_live': not use_timestamps, # If we used timestamps, assume not live
-                            }
-                        else:
-                            self.to_screen(f'No formats found in guessed URL: {test_url}')
-
-                    except ExtractorError as e:
-                        # Log error lightly, as many guesses are expected to fail
-                        self.to_screen(f'Guessed URL failed: {test_url} ({e})')
-                        pass # Continue trying other combinations
-
-        # *** If all strategies fail ***
-        self.to_screen('All extraction strategies failed.')
-
-        # Provide helpful error with suggestions
-        error_message = (
-            f"Could not extract stream URL for {display_id or url}. "
-            "The stream may be old, expired, or use an unsupported format.\n"
-            f"Live status detected: {is_live}\n"
-            "Common issues:\n"
-            "- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n"
-            "- The event might not be available via the standard CDN endpoints/channels.\n"
-            "If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n"
-            "Example (using parsed times, adjust if needed):\n"
+        # Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist)
+        # The regex used here is identical to the one above, ensures we capture all instances
+        import re
+        all_links_text = self._html_search_regex(
+             r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
+             webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences
        )
-        if start_timestamp and end_timestamp:
-             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
-             error_message += f'yt-dlp "{example_url}"'
-        else:
-             example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8"
-             error_message += f'yt-dlp "{example_url}"'
+        candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text)
+
+        # If the specific constructed URL was made above, ensure it's prioritized or the only candidate
+        # (Refined logic needed here based on the modification above)
+        if not candidates and m3u8_links: # Fallback if findall failed but initial search worked
+             candidates = [m3u8_links]
+        elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included
+             candidates.insert(0, m3u8_links)
+
+        candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order
+
+        if not candidates: # Final check if still no candidates
+             self.report_warning('Could not extract any valid .m3u8 URLs.')
+             return {
+                 'id': media_info.get('id') or display_id,
+                 'display_id': display_id,
+                 'title': title,
+                 'release_timestamp': release_timestamp,
+                 'formats': [],
+             }
+
+
+        formats, subtitles = [], {}
+        for link in candidates:
+            # Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive)
+            # The 'live' flag might need adjustment based on mediaSubType
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)

-
-        raise ExtractorError(error_message, expected=True)
+        return {
+            'id': media_info.get('id') or display_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'release_timestamp': release_timestamp,
+             # Report 'is_live' based on detected mediaSubType
+            'is_live': is_live or None # Report None if not explicitly Live
+        }