Update europa.py

pull/12742/head
edmundman 6 months ago committed by GitHub
parent db1f9be975
commit d597dc61a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -12,7 +12,8 @@ from ..utils import (
unified_strdate, unified_strdate,
xpath_text, xpath_text,
) )
import re # Import re for findall # Removed unused 're' import, added 'urllib.parse' for potential future use if needed
# but not strictly required for current modification.
# --- EuropaIE (Older extractor - unchanged) --- # --- EuropaIE (Older extractor - unchanged) ---
# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct. # This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor):
return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats} return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) --- # --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
class EuroParlWebstreamIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?://multimedia\.europarl\.europa\.eu/ https?://multimedia\.europarl\.europa\.eu/
(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
''' '''
_TESTS = [{ _TESTS = [{
# Existing VOD test # Existing VOD test (Should now work better if metadata is consistent)
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
'info_dict': { 'info_dict': {
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY', 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor):
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
# Test case that previously failed with regex method # Test case likely representing an archive/VOD (based on previous context)
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA', 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
'info_dict': { 'info_dict': {
'id': str, # ID might be a string UUID or similar 'id': str, # ID might be a string UUID or similar
'display_id': '20250328-1000-SPECIAL-EUROSCOLA', 'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
'ext': 'mp4', 'ext': 'mp4',
'title': r're:Euroscola', # Expect title containing Euroscola 'title': r're:Euroscola', # Expect title containing Euroscola
'release_timestamp': int, # Expecting a Unix timestamp 'release_timestamp': int, # Expecting a Unix timestamp (start time)
'release_date': '20250328', 'release_date': '20250328',
'is_live': bool, # Could be True (if near event time) or False 'is_live': False, # Should be detected as not live
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
# Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor):
# Extract basic info, falling back to display_id if metadata is sparse # Extract basic info, falling back to display_id if metadata is sparse
internal_id = media_info.get('id') or display_id internal_id = media_info.get('id') or display_id
title = media_info.get('title') or media_info.get('name') or display_id title = media_info.get('title') or media_info.get('name') or display_id
release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
# Extract start and end timestamps, if available
# parse_iso8601 typically returns a float/int timestamp
start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
release_timestamp = start_timestamp # Use start time as the release timestamp
# Determine live status based on metadata hint, if available # Determine live status based on metadata hint, if available
# Treat as not live if 'Live' subtype isn't explicitly present
is_live = media_info.get('mediaSubType') == 'Live' is_live = media_info.get('mediaSubType') == 'Live'
hls_url = None # Variable to store the found HLS URL hls_url = None # Variable to store the found HLS URL
# --- Attempt 1: Find direct HLS URL in media_info --- # --- Attempt 1: Find direct HLS URL in media_info ---
# Check common dictionary keys where the full HLS URL might be stored. # Check common dictionary keys where the full HLS URL might be stored.
# Add more potential keys here if observed in website data.
possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl') possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
hls_url = traverse_obj(media_info, possible_keys) hls_url = traverse_obj(media_info, possible_keys)
if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
self.to_screen(f'Found direct HLS URL in metadata: {hls_url}') self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
# Check if it's an archive URL but missing time params - might need correction later if it fails
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
self.to_screen(f'Corrected direct HLS URL: {hls_url}')
else: else:
hls_url = None # Reset if found value wasn't an HLS URL hls_url = None # Reset if found value wasn't an HLS URL or needs construction
# --- Attempt 2: Construct HLS URL from IDs in media_info --- # --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
if not hls_url: if not hls_url:
self.to_screen('Attempting to construct HLS URL from metadata IDs...') self.to_screen('Attempting to construct HLS URL from metadata...')
# Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common,
# but might differ. Use traverse_obj to safely get values.
# 'id' from media_info is often the event ID.
event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id')) event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
# Channel ID might be numeric or a string name.
channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel')) channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
if event_id and channel_id: if event_id and channel_id:
# Construct the URL using the assumed live/default pattern. if not is_live and start_timestamp and end_timestamp:
# For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed. # Construct ARCHIVE/VOD URL with time parameters
# This assumes the event is live or uses the default endpoint. constructed_url = (
f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
)
hls_url = constructed_url
self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
elif is_live:
# Construct LIVE URL (basic pattern, might need adjustments)
constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8' constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
hls_url = constructed_url hls_url = constructed_url
self.to_screen(f'Constructed potential HLS URL: {hls_url}') self.to_screen(f'Constructed Live HLS URL: {hls_url}')
else:
self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
else: else:
self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.') self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
# --- Attempt 3: Fallback to regex search on raw webpage (Original Method) --- # --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
if not hls_url: if not hls_url:
self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...') self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)' # Updated regex to potentially capture archive URLs with parameters, but prioritize construction
m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
hls_url = self._search_regex( hls_url = self._search_regex(
m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False) m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
if hls_url: if hls_url:
self.to_screen(f'Found HLS URL via regex fallback: {hls_url}') self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
# If regex found an archive URL without params, try adding them as a last resort
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
self.to_screen(f'Corrected regex HLS URL: {hls_url}')
else: else:
# This is where the original "Could not find any .m3u8 link" warning occurred.
self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.') self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
# --- Process HLS Playlist --- # --- Process HLS Playlist ---
if not hls_url: if not hls_url:
# If no URL was found after all attempts, raise an error.
raise ExtractorError( raise ExtractorError(
'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.', 'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
expected=True) # expected=True prevents stack trace for common errors expected=True)
# Pass the found HLS URL to the HLS processing function. # Pass the final HLS URL to the processing function
# The _extract_m3u8_formats function usually detects live/VOD automatically.
# The 'live=is_live' hint can sometimes help but isn't strictly necessary.
formats, subtitles = self._extract_m3u8_formats_and_subtitles( formats, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_url, display_id, ext='mp4', live=is_live, fatal=False) hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
# Check if HLS processing returned any formats # Check if HLS processing returned any formats
if not formats: if not formats:
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True) # Try again, forcing VOD interpretation if it was marked live but failed
if is_live:
self.to_screen('Live HLS processing failed, attempting again as VOD...')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_url, display_id, ext='mp4', live=False, fatal=False)
# If still no formats, raise error
if not formats:
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
# --- Return Extracted Information --- # --- Return Extracted Information ---
return { return {
@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'release_timestamp': release_timestamp, 'release_timestamp': release_timestamp,
'is_live': is_live or None, # Use None if not explicitly marked Live 'is_live': is_live, # Keep original detected live status
} }

Loading…
Cancel
Save