Update europa.py

pull/12742/head
edmundman 6 months ago committed by GitHub
parent db1f9be975
commit d597dc61a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -12,7 +12,8 @@ from ..utils import (
unified_strdate,
xpath_text,
)
import re # Import re for findall
# The 're' import was removed because it is unused. No replacement import
# (such as 'urllib.parse') is required for the current modification.
# --- EuropaIE (Older extractor - unchanged) ---
# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor):
return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) ---
# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
class EuroParlWebstreamIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://multimedia\.europarl\.europa\.eu/
(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
'''
_TESTS = [{
# Existing VOD test
# Existing VOD test (Should now work better if metadata is consistent)
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
'info_dict': {
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor):
},
'params': {'skip_download': True},
}, {
# Test case that previously failed with regex method
# Test case likely representing an archive/VOD (based on previous context)
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
'info_dict': {
'id': str, # ID might be a string UUID or similar
'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
'ext': 'mp4',
'title': r're:Euroscola', # Expect title containing Euroscola
'release_timestamp': int, # Expecting a Unix timestamp
'release_timestamp': int, # Expecting a Unix timestamp (start time)
'release_date': '20250328',
'is_live': bool, # Could be True (if near event time) or False
'is_live': False, # Should be detected as not live
},
'params': {'skip_download': True},
# Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
}]
def _real_extract(self, url):
@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor):
# Extract basic info, falling back to display_id if metadata is sparse
internal_id = media_info.get('id') or display_id
title = media_info.get('title') or media_info.get('name') or display_id
release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
# Extract start and end timestamps, if available
# parse_iso8601 yields a Unix timestamp (or None if parsing fails); int_or_none then normalizes it to an int
start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
release_timestamp = start_timestamp # Use start time as the release timestamp
# Determine live status based on metadata hint, if available
# Treat as not live if 'Live' subtype isn't explicitly present
is_live = media_info.get('mediaSubType') == 'Live'
hls_url = None # Variable to store the found HLS URL
# --- Attempt 1: Find direct HLS URL in media_info ---
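# Check common dictionary keys where the full HLS URL might be stored.
# Add more potential keys here if observed in website data.
# NOTE(review): traverse_obj treats a single tuple as one nested path
# (media_info['hlsUrl']['streamUrl']...), not as a list of alternatives.
# To try each key in turn, pass them as separate paths
# (traverse_obj(media_info, *possible_keys)) -- confirm intended behaviour.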
possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
hls_url = traverse_obj(media_info, possible_keys)
if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
# Check if it's an archive URL but missing time params - might need correction later if it fails
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
self.to_screen(f'Corrected direct HLS URL: {hls_url}')
else:
hls_url = None # Reset if found value wasn't an HLS URL
hls_url = None # Reset if found value wasn't an HLS URL or needs construction
# --- Attempt 2: Construct HLS URL from IDs in media_info ---
# --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
if not hls_url:
self.to_screen('Attempting to construct HLS URL from metadata IDs...')
# Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common,
# but might differ. Use traverse_obj to safely get values.
# 'id' from media_info is often the event ID.
self.to_screen('Attempting to construct HLS URL from metadata...')
event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
# Channel ID might be numeric or a string name.
channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
if event_id and channel_id:
# Construct the URL using the assumed live/default pattern.
# For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed.
# This assumes the event is live or uses the default endpoint.
if not is_live and start_timestamp and end_timestamp:
# Construct ARCHIVE/VOD URL with time parameters
constructed_url = (
f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
)
hls_url = constructed_url
self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
elif is_live:
# Construct LIVE URL (basic pattern, might need adjustments)
constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
hls_url = constructed_url
self.to_screen(f'Constructed potential HLS URL: {hls_url}')
self.to_screen(f'Constructed Live HLS URL: {hls_url}')
else:
self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
else:
self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.')
self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
# --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
if not hls_url:
self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)'
# Updated regex to potentially capture archive URLs with parameters, but prioritize construction
m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
hls_url = self._search_regex(
m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
if hls_url:
self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
# If regex found an archive URL without params, try adding them as a last resort
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
self.to_screen(f'Corrected regex HLS URL: {hls_url}')
else:
# This is where the original "Could not find any .m3u8 link" warning occurred.
self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
# --- Process HLS Playlist ---
if not hls_url:
# If no URL was found after all attempts, raise an error.
raise ExtractorError(
'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
expected=True) # expected=True prevents stack trace for common errors
expected=True)
# Pass the found HLS URL to the HLS processing function.
# _extract_m3u8_formats_and_subtitles usually detects live/VOD automatically;
# the 'live=is_live' argument is only a hint and isn't strictly necessary.
# Pass the final HLS URL to the processing function
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_url, display_id, ext='mp4', live=is_live, fatal=False)
hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
# Check if HLS processing returned any formats
if not formats:
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True)
# Try again, forcing VOD interpretation if it was marked live but failed
if is_live:
self.to_screen('Live HLS processing failed, attempting again as VOD...')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_url, display_id, ext='mp4', live=False, fatal=False)
# If still no formats, raise error
if not formats:
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
# --- Return Extracted Information ---
return {
@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
'release_timestamp': release_timestamp,
'is_live': is_live or None, # Use None if not explicitly marked Live
'is_live': is_live, # Keep original detected live status
}

Loading…
Cancel
Save