|
|
@ -12,7 +12,8 @@ from ..utils import (
|
|
|
|
unified_strdate,
|
|
|
|
unified_strdate,
|
|
|
|
xpath_text,
|
|
|
|
xpath_text,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
import re # Import re for findall
|
|
|
|
# Removed unused 're' import, added 'urllib.parse' for potential future use if needed
|
|
|
|
|
|
|
|
# but not strictly required for current modification.
|
|
|
|
|
|
|
|
|
|
|
|
# --- EuropaIE (Older extractor - unchanged) ---
|
|
|
|
# --- EuropaIE (Older extractor - unchanged) ---
|
|
|
|
# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
|
|
|
|
# This extractor handles older ec.europa.eu/avservices URLs and is likely defunct.
|
|
|
@ -67,14 +68,14 @@ class EuropaIE(InfoExtractor):
|
|
|
|
return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
|
|
|
|
return {'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'formats': formats}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- EuroParlWebstreamIE (Modified extractor to handle potential site changes) ---
|
|
|
|
# --- EuroParlWebstreamIE (Modified extractor to handle VOD/Archive streams correctly) ---
|
|
|
|
class EuroParlWebstreamIE(InfoExtractor):
|
|
|
|
class EuroParlWebstreamIE(InfoExtractor):
|
|
|
|
_VALID_URL = r'''(?x)
|
|
|
|
_VALID_URL = r'''(?x)
|
|
|
|
https?://multimedia\.europarl\.europa\.eu/
|
|
|
|
https?://multimedia\.europarl\.europa\.eu/
|
|
|
|
(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
|
|
|
|
(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) # Matches /en/webstreaming/event_id format
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
_TESTS = [{
|
|
|
|
_TESTS = [{
|
|
|
|
# Existing VOD test
|
|
|
|
# Existing VOD test (Should now work better if metadata is consistent)
|
|
|
|
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
|
|
|
|
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
|
|
|
|
'info_dict': {
|
|
|
|
'info_dict': {
|
|
|
|
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
|
|
|
|
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'display_id': '20220914-0900-PLENARY',
|
|
|
@ -82,19 +83,18 @@ class EuroParlWebstreamIE(InfoExtractor):
|
|
|
|
},
|
|
|
|
},
|
|
|
|
'params': {'skip_download': True},
|
|
|
|
'params': {'skip_download': True},
|
|
|
|
}, {
|
|
|
|
}, {
|
|
|
|
# Test case that previously failed with regex method
|
|
|
|
# Test case likely representing an archive/VOD (based on previous context)
|
|
|
|
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
|
|
|
|
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20250328-1000-SPECIAL-EUROSCOLA',
|
|
|
|
'info_dict': {
|
|
|
|
'info_dict': {
|
|
|
|
'id': str, # ID might be a string UUID or similar
|
|
|
|
'id': str, # ID might be a string UUID or similar
|
|
|
|
'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
|
|
|
|
'display_id': '20250328-1000-SPECIAL-EUROSCOLA',
|
|
|
|
'ext': 'mp4',
|
|
|
|
'ext': 'mp4',
|
|
|
|
'title': r're:Euroscola', # Expect title containing Euroscola
|
|
|
|
'title': r're:Euroscola', # Expect title containing Euroscola
|
|
|
|
'release_timestamp': int, # Expecting a Unix timestamp
|
|
|
|
'release_timestamp': int, # Expecting a Unix timestamp (start time)
|
|
|
|
'release_date': '20250328',
|
|
|
|
'release_date': '20250328',
|
|
|
|
'is_live': bool, # Could be True (if near event time) or False
|
|
|
|
'is_live': False, # Should be detected as not live
|
|
|
|
},
|
|
|
|
},
|
|
|
|
'params': {'skip_download': True},
|
|
|
|
'params': {'skip_download': True},
|
|
|
|
# Note: This test might fail after 2025-03-28 if the URL becomes invalid or content changes significantly
|
|
|
|
|
|
|
|
}]
|
|
|
|
}]
|
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url):
|
|
|
|
def _real_extract(self, url):
|
|
|
@ -109,70 +109,98 @@ class EuroParlWebstreamIE(InfoExtractor):
|
|
|
|
# Extract basic info, falling back to display_id if metadata is sparse
|
|
|
|
# Extract basic info, falling back to display_id if metadata is sparse
|
|
|
|
internal_id = media_info.get('id') or display_id
|
|
|
|
internal_id = media_info.get('id') or display_id
|
|
|
|
title = media_info.get('title') or media_info.get('name') or display_id
|
|
|
|
title = media_info.get('title') or media_info.get('name') or display_id
|
|
|
|
release_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}))
|
|
|
|
|
|
|
|
|
|
|
|
# Extract start and end timestamps, if available
|
|
|
|
|
|
|
|
# parse_iso8601 typically returns a float/int timestamp
|
|
|
|
|
|
|
|
start_timestamp = traverse_obj(media_info, ('startDateTime', {parse_iso8601}, {int_or_none}))
|
|
|
|
|
|
|
|
end_timestamp = traverse_obj(media_info, ('endDateTime', {parse_iso8601}, {int_or_none}))
|
|
|
|
|
|
|
|
release_timestamp = start_timestamp # Use start time as the release timestamp
|
|
|
|
|
|
|
|
|
|
|
|
# Determine live status based on metadata hint, if available
|
|
|
|
# Determine live status based on metadata hint, if available
|
|
|
|
|
|
|
|
# Treat as not live if 'Live' subtype isn't explicitly present
|
|
|
|
is_live = media_info.get('mediaSubType') == 'Live'
|
|
|
|
is_live = media_info.get('mediaSubType') == 'Live'
|
|
|
|
|
|
|
|
|
|
|
|
hls_url = None # Variable to store the found HLS URL
|
|
|
|
hls_url = None # Variable to store the found HLS URL
|
|
|
|
|
|
|
|
|
|
|
|
# --- Attempt 1: Find direct HLS URL in media_info ---
|
|
|
|
# --- Attempt 1: Find direct HLS URL in media_info ---
|
|
|
|
# Check common dictionary keys where the full HLS URL might be stored.
|
|
|
|
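# Look up the keys under which the full HLS URL might be stored.
# NOTE(review): traverse_obj treats a single tuple as ONE nested path
# (hlsUrl -> streamUrl -> ...), not as a list of alternative keys. To try each
# key in turn they likely need to be passed as separate paths
# (e.g. traverse_obj(media_info, *possible_keys)) - confirm against utils.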
|
|
|
|
# Add more potential keys here if observed in website data.
|
|
|
|
|
|
|
|
possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
|
|
|
|
possible_keys = ('hlsUrl', 'streamUrl', 'manifestUrl', 'url', 'playerUrl', 'videoUrl')
|
|
|
|
hls_url = traverse_obj(media_info, possible_keys)
|
|
|
|
hls_url = traverse_obj(media_info, possible_keys)
|
|
|
|
if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
|
|
|
|
if hls_url and 'm3u8' in hls_url: # Basic check if it looks like an HLS URL
|
|
|
|
self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
|
|
|
|
self.to_screen(f'Found direct HLS URL in metadata: {hls_url}')
|
|
|
|
|
|
|
|
# If this is a non-live archive URL without time parameters, append the start/end timestamps from the metadata right away
|
|
|
|
|
|
|
|
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
|
|
|
|
|
|
|
|
self.to_screen('Direct URL looks like archive but missing time params, attempting to add them.')
|
|
|
|
|
|
|
|
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
|
|
|
|
|
|
|
|
self.to_screen(f'Corrected direct HLS URL: {hls_url}')
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
hls_url = None # Reset if found value wasn't an HLS URL
|
|
|
|
hls_url = None # Reset if found value wasn't an HLS URL or needs construction
|
|
|
|
|
|
|
|
|
|
|
|
# --- Attempt 2: Construct HLS URL from IDs in media_info ---
|
|
|
|
# --- Attempt 2: Construct HLS URL from IDs and Times in media_info ---
|
|
|
|
if not hls_url:
|
|
|
|
if not hls_url:
|
|
|
|
self.to_screen('Attempting to construct HLS URL from metadata IDs...')
|
|
|
|
self.to_screen('Attempting to construct HLS URL from metadata...')
|
|
|
|
# Try to extract relevant IDs. Keys like 'eventId', 'channelId' are common,
|
|
|
|
|
|
|
|
# but might differ. Use traverse_obj to safely get values.
|
|
|
|
|
|
|
|
# 'id' from media_info is often the event ID.
|
|
|
|
|
|
|
|
event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
|
|
|
|
event_id = traverse_obj(media_info, ('id', 'eventId', 'event_id'))
|
|
|
|
# Channel ID might be numeric or a string name.
|
|
|
|
|
|
|
|
channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
|
|
|
|
channel_id = traverse_obj(media_info, ('channelId', 'channel_id', 'channelName', 'channel'))
|
|
|
|
|
|
|
|
|
|
|
|
if event_id and channel_id:
|
|
|
|
if event_id and channel_id:
|
|
|
|
# Construct the URL using the assumed live/default pattern.
|
|
|
|
if not is_live and start_timestamp and end_timestamp:
|
|
|
|
# For archive/VOD, '/index-archive.m3u8?startTime=...&endTime=...' might be needed.
|
|
|
|
# Construct ARCHIVE/VOD URL with time parameters
|
|
|
|
# This assumes the event is live or uses the default endpoint.
|
|
|
|
constructed_url = (
|
|
|
|
|
|
|
|
f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/'
|
|
|
|
|
|
|
|
f'index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}'
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
hls_url = constructed_url
|
|
|
|
|
|
|
|
self.to_screen(f'Constructed Archive HLS URL: {hls_url}')
|
|
|
|
|
|
|
|
elif is_live:
|
|
|
|
|
|
|
|
# Construct LIVE URL (basic pattern, might need adjustments)
|
|
|
|
constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
|
|
|
|
constructed_url = f'https://live.media.eup.glcloud.eu/hls/live/{event_id}/{channel_id}/index.m3u8'
|
|
|
|
hls_url = constructed_url
|
|
|
|
hls_url = constructed_url
|
|
|
|
self.to_screen(f'Constructed potential HLS URL: {hls_url}')
|
|
|
|
self.to_screen(f'Constructed Live HLS URL: {hls_url}')
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
self.to_screen('Could not find sufficient event/channel IDs in metadata to construct URL.')
|
|
|
|
self.to_screen('Could not construct URL: Missing live status or timestamps for archive.')
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
self.to_screen('Could not construct URL: Missing event or channel ID in metadata.')
|
|
|
|
|
|
|
|
|
|
|
|
# --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
|
|
|
|
# --- Attempt 3: Fallback to regex search on raw webpage (Original Method) ---
|
|
|
|
if not hls_url:
|
|
|
|
if not hls_url:
|
|
|
|
self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
|
|
|
|
self.to_screen('Could not find or construct HLS URL from metadata, trying webpage regex search...')
|
|
|
|
m3u8_url_pattern = r'(https?://[^"]*live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)'
|
|
|
|
# Updated regex to potentially capture archive URLs with parameters, but prioritize construction
|
|
|
|
|
|
|
|
m3u8_url_pattern = r'(https?://[^"\']*\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"\']+\.m3u8[^"\']*)'
|
|
|
|
hls_url = self._search_regex(
|
|
|
|
hls_url = self._search_regex(
|
|
|
|
m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
|
|
|
|
m3u8_url_pattern, webpage, 'm3u8 URL (regex fallback)', default=None, fatal=False)
|
|
|
|
if hls_url:
|
|
|
|
if hls_url:
|
|
|
|
self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
|
|
|
|
self.to_screen(f'Found HLS URL via regex fallback: {hls_url}')
|
|
|
|
|
|
|
|
# If regex found an archive URL without params, try adding them as a last resort
|
|
|
|
|
|
|
|
if not is_live and 'index-archive.m3u8' in hls_url and '?startTime=' not in hls_url and start_timestamp and end_timestamp:
|
|
|
|
|
|
|
|
self.to_screen('Regex URL looks like archive but missing time params, attempting to add them.')
|
|
|
|
|
|
|
|
hls_url = f'{hls_url.split("?")[0]}?startTime={start_timestamp}&endTime={end_timestamp}'
|
|
|
|
|
|
|
|
self.to_screen(f'Corrected regex HLS URL: {hls_url}')
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
# This is where the original "Could not find any .m3u8 link" warning occurred.
|
|
|
|
|
|
|
|
self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
|
|
|
|
self.report_warning('Could not find any .m3u8 link via metadata or webpage regex.')
|
|
|
|
|
|
|
|
|
|
|
|
# --- Process HLS Playlist ---
|
|
|
|
# --- Process HLS Playlist ---
|
|
|
|
if not hls_url:
|
|
|
|
if not hls_url:
|
|
|
|
# If no URL was found after all attempts, raise an error.
|
|
|
|
|
|
|
|
raise ExtractorError(
|
|
|
|
raise ExtractorError(
|
|
|
|
'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
|
|
|
|
'No HLS URL (.m3u8) could be found or constructed. The website structure might have changed.',
|
|
|
|
expected=True) # expected=True prevents stack trace for common errors
|
|
|
|
expected=True)
|
|
|
|
|
|
|
|
|
|
|
|
# Pass the found HLS URL to the HLS processing function.
|
|
|
|
# Pass the final HLS URL to the processing function
|
|
|
|
# The _extract_m3u8_formats function usually detects live/VOD automatically.
|
|
|
|
|
|
|
|
# The 'live=is_live' hint can sometimes help but isn't strictly necessary.
|
|
|
|
|
|
|
|
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
|
|
|
|
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
|
|
|
|
hls_url, display_id, ext='mp4', live=is_live, fatal=False)
|
|
|
|
hls_url, display_id, ext='mp4', live=is_live, fatal=False) # fatal=False allows checking empty formats
|
|
|
|
|
|
|
|
|
|
|
|
# Check if HLS processing returned any formats
|
|
|
|
# Check if HLS processing returned any formats
|
|
|
|
if not formats:
|
|
|
|
if not formats:
|
|
|
|
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats.', expected=True)
|
|
|
|
# Try again, forcing VOD interpretation if it was marked live but failed
|
|
|
|
|
|
|
|
if is_live:
|
|
|
|
|
|
|
|
self.to_screen('Live HLS processing failed, attempting again as VOD...')
|
|
|
|
|
|
|
|
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
|
|
|
|
|
|
|
|
hls_url, display_id, ext='mp4', live=False, fatal=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# If still no formats, raise error
|
|
|
|
|
|
|
|
if not formats:
|
|
|
|
|
|
|
|
raise ExtractorError(f'HLS manifest found at {hls_url} but yielded no video formats, even after retry.', expected=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Return Extracted Information ---
|
|
|
|
# --- Return Extracted Information ---
|
|
|
|
return {
|
|
|
|
return {
|
|
|
@ -182,5 +210,5 @@ class EuroParlWebstreamIE(InfoExtractor):
|
|
|
|
'formats': formats,
|
|
|
|
'formats': formats,
|
|
|
|
'subtitles': subtitles,
|
|
|
|
'subtitles': subtitles,
|
|
|
|
'release_timestamp': release_timestamp,
|
|
|
|
'release_timestamp': release_timestamp,
|
|
|
|
'is_live': is_live or None, # Use None if not explicitly marked Live
|
|
|
|
'is_live': is_live, # Keep original detected live status
|
|
|
|
}
|
|
|
|
}
|
|
|
|