Update europa.py

pull/12742/head
edmundman 6 months ago committed by GitHub
parent fe08c6ca27
commit 6e3ddbbe4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,4 +1,3 @@
# coding: utf-8
from .common import InfoExtractor
from ..utils import (
int_or_none,
@ -10,15 +9,7 @@ from ..utils import (
traverse_obj,
unified_strdate,
xpath_text,
ExtractorError,
js_to_json,
urljoin
)
import re
import json
import time
import datetime
class EuropaIE(InfoExtractor):
_WORKING = False
@ -54,7 +45,10 @@ class EuropaIE(InfoExtractor):
def get_item(type_, preference):
items = {}
for item in playlist.findall(f'./info/{type_}/item'):
lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
lang, label = (
xpath_text(item, 'lg', default=None),
xpath_text(item, 'label', default=None)
)
if lang and label:
items[lang] = label.strip()
for p in preference:
@ -63,7 +57,6 @@ class EuropaIE(InfoExtractor):
query = parse_qs(url)
preferred_lang = query.get('sitelang', ('en', ))[0]
preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
title = get_item('title', preferred_langs) or video_id
@ -102,320 +95,131 @@ class EuropaIE(InfoExtractor):
class EuroParlWebstreamIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:
multimedia\.europarl\.europa\.eu/(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)|
live\.media\.eup\.glcloud\.eu/hls/live/(?P<live_id>[\w-]+)/(?P<channel>channel-\d+-\w+|[\w-]+)/(?:input/\d+/\d+/[\w-]+/)?(?P<stream_id>[\w.-]+)(?:\.m3u8|/master\.m3u8|\?) # Allow dots and hyphens in stream_id, make .m3u8 optional if query follows
)
https?://multimedia\.europarl\.europa\.eu/
(?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+)
'''
_TESTS = [{
'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
'info_dict': {
'id': '20220914-0900-PLENARY',
'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
'display_id': '20220914-0900-PLENARY',
'ext': 'mp4',
'title': 'Plenary session',
'release_timestamp': 1663139069,
'release_date': '20220914',
},
'params': {
'skip_download': True,
},
}, {
# Direct HLS stream URL (archive example similar to user provided)
'url': 'https://live.media.eup.glcloud.eu/hls/live/2113713/channel-01-stb/input/1/256/p1080___6798871408e31898bdd1a1af/norsk-archive.m3u8?startTime=1743152400&endTime=1743162442',
# example of old live webstream
'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
'info_dict': {
'id': 'norsk-archive', # ID derived from filename before query
'ext': 'mp4',
'title': 'European Parliament Stream',
},
'params': {
'skip_download': True,
},
},{
# Direct HLS stream URL (live example)
'url': 'https://live.media.eup.glcloud.eu/hls/live/2113753/channel-07-bxl/index.m3u8',
'info_dict': {
'id': 'index',
'ext': 'mp4',
'title': 'European Parliament Stream',
},
'params': {
'skip_download': True,
'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
'release_timestamp': 1668502800,
'title': 'Euroscola 2022-11-15 19:21',
'release_date': '20221115',
'live_status': 'is_live',
},
'skip': 'not live anymore',
}]
# Known CDN endpoint IDs - tried if direct extraction from the webpage fails.
# Each value fills the `<endpoint>` segment of a guessed URL of the form
# https://live.media.eup.glcloud.eu/hls/live/<endpoint>/<channel>/<filename>.
# 2113713 and 2113713-b were added based on a user-supplied M3U8.
ENDPOINTS = ["2113753", "2113713", "2113713-b"]
# Channel names tried for each endpoint, in list order. Per the original
# author the order reflects observed success rates and a user-supplied M3U8
# (which is where channel-01-stb comes from).
PRIORITY_CHANNELS = ["channel-07-bxl", "channel-01-stb", "channel-01-bxl", "channel-10-bxl"]
# Default playlist filenames by content type (live vs. archive).
# These are used only in the *fallback* URL-guessing logic.
# Deep paths such as input/1/256/... (seen in the user-supplied M3U8) CANNOT be guessed.
LIVE_STREAM_FILENAMES = ["index.m3u8", "master.m3u8", "playlist.m3u8"]
ARCHIVE_STREAM_FILENAMES = ["index-archive.m3u8", "norsk-archive.m3u8", "index.m3u8", "master.m3u8"]
def _extract_direct_url_from_webpage(self, webpage):
    """Collect direct HLS manifest URLs for the glcloud CDN found in a webpage.

    Args:
        webpage: The page source as a string.

    Returns:
        A list of unique ``.m3u8`` URLs on ``live.media.eup.glcloud.eu``.
        The list is in first-seen order: patterns are tried in sequence and
        each pattern's matches are taken in document order. Callers try the
        candidates one by one and stop at the first that works, so a stable
        order keeps the result reproducible between runs (a plain ``set``
        would not).
    """
    # Every pattern has exactly one capture group, so re.findall yields
    # plain strings, and each already requires the CDN host and ".m3u8".
    url_patterns = (
        r'["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8(?:\?[^"\'\s]*)?)["\']',
        r'"url"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
        # Assignments or attributes
        r'=\s*["\'](https?://live\.media\.eup\.glcloud\.eu/[^"\'\s]+\.m3u8[^"\']*)["\']',
        # URLs within JSON-like structures in script tags
        r'"src"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
        r'"file"\s*:\s*"(https?://live\.media\.eup\.glcloud\.eu/[^"]+\.m3u8[^"]*)"',
    )

    def strip_js_escapes(candidate):
        # Undo JS string escaping of slashes and backslashes.
        return candidate.replace('\\/', '/').replace('\\\\', '\\')

    # A dict is used as an insertion-ordered set.
    found = {}
    for pattern in url_patterns:
        for candidate in re.findall(pattern, webpage):
            found.setdefault(strip_js_escapes(candidate), None)

    # Also accept a URL pasted from a browser network panel ("Request URL: ...").
    network_match = re.search(
        r'Request URL:[\s\n]*(?:<[^>]+>)?[\s\n]*(https://live\.media\.eup\.glcloud\.eu/[^\s<]+\.m3u8[^\s<]*)',
        webpage, re.IGNORECASE)
    if network_match:
        found.setdefault(strip_js_escapes(network_match.group(1)), None)

    self.to_screen(f'Found {len(found)} potential direct M3U8 URLs in webpage')
    return list(found)
def _extract_title_from_webpage(self, webpage, display_id):
    """Extract a human-readable title from the webpage.

    Sources are tried in order: ``og:title`` meta tag, ``<title>``, the
    ``erpl_title-h1`` heading, any ``<h1>``, then a JSON ``"title"`` value.
    The site suffixes "| European Parliament" and "- Multimedia Centre" are
    removed. A source whose text is empty after clean-up is skipped.

    Args:
        webpage: The page source as a string.
        display_id: Identifier used to build the fallback title.

    Returns:
        The cleaned title, or ``"European Parliament Session - <display_id>"``
        when no source yields a non-empty title.
    """
    import html  # function-scope import: only this method needs it

    # (pattern, from_markup): text captured from HTML markup may carry
    # character references (e.g. "&amp;") that must be decoded. The JSON
    # value is not HTML-encoded, so it is left as-is.
    title_sources = (
        (r'<meta property="og:title" content="([^"]+)"', True),
        (r'<title>([^<]+)</title>', True),
        (r'<h1[^>]*class="erpl_title-h1"[^>]*>([^<]+)</h1>', True),  # Specific title class
        (r'<h1[^>]*>([^<]+)</h1>', True),
        (r'"title"\s*:\s*"([^"]+)"', False),
    )
    for pattern, from_markup in title_sources:
        found = re.search(pattern, webpage)
        if not found:
            continue
        title = found.group(1)
        if from_markup:
            title = html.unescape(title)
        title = title.strip()
        # Clean up common suffixes
        title = re.sub(r'\s*\|\s*European Parliament$', '', title).strip()
        title = re.sub(r'\s*-\s*Multimedia Centre$', '', title).strip()
        if title:
            return title
    return f"European Parliament Session - {display_id}"  # Fallback title
def _parse_meeting_date(self, display_id):
    """Derive an archive time window from a meeting ID.

    Meeting IDs are expected to look like ``YYYYMMDD-HHMM-TYPE``. The
    scheduled time is interpreted as UTC (an assumption made by this code,
    not something verified against the site).

    Args:
        display_id: The meeting ID; may be ``None`` or in another format.

    Returns:
        ``(start_ts, end_ts)`` as integer Unix timestamps. For a parsable
        ID the window runs from 3 hours before to 6 hours after the
        scheduled time, to catch streams that start early or run long.
        Otherwise the window is 24 hours before now to 1 hour after now.
    """
    # `or ''` keeps a missing ID on the fallback path instead of raising
    # TypeError inside re.match.
    id_match = re.match(r'(\d{8})-(\d{4})-(.+)', display_id or '')
    if id_match:
        date_str, time_str = id_match.group(1, 2)
        try:
            meeting_dt = datetime.datetime.strptime(
                f'{date_str}{time_str}', '%Y%m%d%H%M',
            ).replace(tzinfo=datetime.timezone.utc)  # Assume UTC
            start_ts = int((meeting_dt - datetime.timedelta(hours=3)).timestamp())
            end_ts = int((meeting_dt + datetime.timedelta(hours=6)).timestamp())
        except (ValueError, OverflowError) as e:
            # Eight/four digits that do not form a real date/time (e.g.
            # month 13), or a window outside the supported range.
            self.to_screen(f'Error parsing date from display_id "{display_id}": {e}')
        else:
            self.to_screen(f'Parsed date {date_str}-{time_str}. Using archive time window: {start_ts} to {end_ts}')
            return start_ts, end_ts

    # Fallback to a recent window if parsing fails or ID format is different
    self.to_screen(f'Could not parse specific date from "{display_id}". Using generic recent time window.')
    now = int(time.time())
    start_time = now - (24 * 3600)  # 24 hours ago (might be too short for older archives)
    end_time = now + (1 * 3600)  # 1 hour in the future (for live/recent)
    return start_time, end_time
def _real_extract(self, url):
mobj = self._match_valid_url(url)
# Get potential IDs from the regex match groups
display_id = mobj.group('id')
live_id = mobj.group('live_id')
stream_id = mobj.group('stream_id')
channel = mobj.group('channel')
# Use the most specific ID available
video_id = display_id or stream_id or live_id or channel
# Handle direct HLS URLs first (most reliable if provided)
if live_id and (stream_id or channel):
# Clean up stream_id (remove query parameters for use as info dict id)
clean_stream_id = stream_id.split('?')[0] if stream_id and '?' in stream_id else stream_id
# If stream_id is missing but channel exists, use channel as part of the id
final_id = clean_stream_id or channel or 'unknown_stream'
# Remove potential .m3u8 suffix for cleaner ID
if final_id.endswith('.m3u8'):
final_id = final_id[:-5]
self.to_screen(f'Processing direct HLS URL: {url}')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
url, final_id, 'mp4', m3u8_id='hls', fatal=False, quiet=True) # Don't fail hard if extraction issues
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
if not formats:
self.report_warning(f'Could not extract any formats from the direct M3U8 URL: {url}')
# Optionally, you could attempt webpage download here as a fallback, but direct URLs should ideally work
# raise ExtractorError('Failed to extract formats from direct HLS URL.', expected=True)
# Try to parse Next.js data for metadata
nextjs = self._search_nextjs_data(webpage, display_id, default={})
page_props = traverse_obj(nextjs, ('props', 'pageProps'), default={})
media_info = page_props.get('mediaItem') or {} # Look for start/end times here for archives?
title = media_info.get('title') or media_info.get('name') or display_id
release_timestamp = None
# Existing logic uses startDateTime, might need adjustment for archive start/end
if 'startDateTime' in media_info:
release_timestamp = parse_iso8601(media_info['startDateTime'])
# Determine if it's Live or VOD/Archive (might need refinement)
# mediaSubType might be 'Live' or 'VOD' or something else
is_live = media_info.get('mediaSubType') == 'Live'
# Search for any .m3u8 link first
m3u8_links = self._search_regex(
r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
webpage, 'm3u8 URL', default=None, group=1, fatal=False
)
# --- Potential modification area START ---
# If it's NOT live, and we have start/end times, and m3u8_links points to a live URL,
# try constructing the index-archive.m3u8 URL here.
# Example (conceptual - requires actual start/end times and base URL logic):
# if not is_live and media_info.get('startTime') and media_info.get('endTime'):
# start_time = media_info['startTime'] # Assuming these keys exist and hold timestamps
# end_time = media_info['endTime']
# # Assuming m3u8_links contains a base URL that needs modification
# base_url = m3u8_links.split('/')[0:-1] # Highly simplified base URL extraction
# archive_url = '/'.join(base_url) + f'/index-archive.m3u8?startTime={start_time}&endTime={end_time}'
# m3u8_links = archive_url # Replace the found link with the constructed one
# --- Potential modification area END ---
if not m3u8_links:
self.report_warning('Could not find any .m3u8 link in the page. The site structure may have changed.')
# Return basic info if no HLS manifest found
return {
'id': final_id,
'title': 'European Parliament Stream', # Generic title for direct URLs
'formats': formats or [],
'subtitles': subtitles or {},
'is_live': '?startTime=' not in url and 'archive' not in url.lower(), # Basic guess based on URL
'id': media_info.get('id') or display_id,
'display_id': display_id,
'title': title,
'release_timestamp': release_timestamp,
'formats': [],
}
# --- Fallback for multimedia.europarl.europa.eu URLs ---
if not display_id: # Should have display_id if it's not a direct HLS URL
raise ExtractorError('Failed to identify video ID from URL.')
self.to_screen(f'Processing webpage URL: {url}')
webpage = self._download_webpage(url, display_id)
# Check for live indicators more reliably
# Look for common live indicators in JS, classes, or text
is_live = bool(re.search(
r'(?:isLive\s*:\s*true|"liveStatus"\s*:\s*"live"|player-live|Live now|En direct|IN DIRETTA|EN VIVO|NA ŻYWO)',
webpage,
re.IGNORECASE))
self.to_screen(f'Detected as live: {is_live}')
# Extract title
title = self._extract_title_from_webpage(webpage, display_id)
# *** Strategy 1: Extract direct URLs from webpage (Preferred) ***
direct_urls = self._extract_direct_url_from_webpage(webpage)
formats = []
subtitles = {}
if direct_urls:
self.to_screen(f'Attempting extraction from {len(direct_urls)} direct URLs found in webpage...')
for m3u8_url in direct_urls:
# Clean stream ID from URL for format identification
m3u8_stream_id = m3u8_url.split('/')[-1].split('?')[0]
if m3u8_stream_id.endswith('.m3u8'):
m3u8_stream_id = m3u8_stream_id[:-5]
try:
fmt, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, display_id, 'mp4', m3u8_id=f'hls-{m3u8_stream_id}', fatal=False) # Don't stop on first error
if fmt:
self.to_screen(f'Successfully extracted formats from: {m3u8_url}')
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
# If we found formats, we are likely done, return immediately
return {
'id': display_id,
'display_id': display_id,
'title': title,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live or ('?startTime=' not in m3u8_url and 'archive' not in m3u8_url.lower()), # Refine live status based on URL
}
else:
self.to_screen(f'No formats found in: {m3u8_url}')
except ExtractorError as e:
self.to_screen(f'Error extracting from direct URL {m3u8_url}: {e}')
pass # Try the next direct URL
else:
self.to_screen('No direct M3U8 URLs found in webpage.')
# *** Strategy 2: Fallback - Guessing URLs (Less Reliable, esp. for complex paths) ***
self.to_screen('Attempting fallback URL guessing strategy (may not work for all streams)...')
# Parse timestamps for archive retrieval (or use a window for live/unknown)
# Always parse, even if live, as it might be a recently finished live event
start_timestamp, end_timestamp = self._parse_meeting_date(display_id)
# Use appropriate stream filenames for the content type
stream_filenames = self.LIVE_STREAM_FILENAMES if is_live else self.ARCHIVE_STREAM_FILENAMES
# Try combinations with updated endpoints and channels
for endpoint in self.ENDPOINTS:
for channel_to_try in self.PRIORITY_CHANNELS:
for filename in stream_filenames:
base_url = f"https://live.media.eup.glcloud.eu/hls/live/{endpoint}/{channel_to_try}/{filename}"
# Determine if timestamps should be added
# Add timestamps if it's explicitly not live, OR if the filename suggests archive,
# OR if start/end timestamps were successfully parsed from the ID.
# Avoid timestamps for clearly live filenames unless forced by non-live status.
use_timestamps = (
(not is_live or 'archive' in filename.lower())
and start_timestamp and end_timestamp
)
test_url = f"{base_url}?startTime={start_timestamp}&endTime={end_timestamp}" if use_timestamps else base_url
try:
self.to_screen(f'Trying guessed URL: {test_url}')
fmt, subs = self._extract_m3u8_formats_and_subtitles(
test_url, display_id, 'mp4', m3u8_id=f'hls-guessed-{channel_to_try}-{filename.replace(".m3u8", "")}', fatal=False)
if fmt:
self.to_screen(f'Success with guessed URL: {test_url}')
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
# Found a working combination
return {
'id': display_id,
'display_id': display_id,
'title': title,
'formats': formats,
'subtitles': subtitles,
'is_live': not use_timestamps, # If we used timestamps, assume not live
}
else:
self.to_screen(f'No formats found in guessed URL: {test_url}')
except ExtractorError as e:
# Log error lightly, as many guesses are expected to fail
self.to_screen(f'Guessed URL failed: {test_url} ({e})')
pass # Continue trying other combinations
# *** If all strategies fail ***
self.to_screen('All extraction strategies failed.')
# Provide helpful error with suggestions
error_message = (
f"Could not extract stream URL for {display_id or url}. "
"The stream may be old, expired, or use an unsupported format.\n"
f"Live status detected: {is_live}\n"
"Common issues:\n"
"- The specific URL structure (especially for archives like 'norsk-archive.m3u8' with deep paths) might not be guessable.\n"
"- The event might not be available via the standard CDN endpoints/channels.\n"
"If you know the direct `.m3u8` URL, try using it with yt-dlp directly.\n"
"Example (using parsed times, adjust if needed):\n"
# Process all found .m3u8 links (handles case where multiple are found or the first one is a master playlist)
# The regex used here is identical to the one above, ensures we capture all instances
import re
all_links_text = self._html_search_regex(
r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)',
webpage, 'all m3u8 URLs', default='', fatal=False, group=0 # Find all occurrences
)
if start_timestamp and end_timestamp:
example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index-archive.m3u8?startTime={start_timestamp}&endTime={end_timestamp}"
error_message += f'yt-dlp "{example_url}"'
else:
example_url = f"https://live.media.eup.glcloud.eu/hls/live/{self.ENDPOINTS[0]}/{self.PRIORITY_CHANNELS[0]}/index.m3u8"
error_message += f'yt-dlp "{example_url}"'
candidates = re.findall(r'(https?://[^"]+live\.media\.eup\.glcloud\.eu/hls/live/\d+/[^"]+\.m3u8[^"]*)', all_links_text)
# If the specific constructed URL was made above, ensure it's prioritized or the only candidate
# (Refined logic needed here based on the modification above)
if not candidates and m3u8_links: # Fallback if findall failed but initial search worked
candidates = [m3u8_links]
elif m3u8_links not in candidates and m3u8_links: # Ensure the primary (possibly constructed) link is included
candidates.insert(0, m3u8_links)
candidates = list(dict.fromkeys(candidates)) # Make unique, preserving order
if not candidates: # Final check if still no candidates
self.report_warning('Could not extract any valid .m3u8 URLs.')
return {
'id': media_info.get('id') or display_id,
'display_id': display_id,
'title': title,
'release_timestamp': release_timestamp,
'formats': [],
}
formats, subtitles = [], {}
for link in candidates:
# Pass the identified m3u8 URL (could be live, index-archive, or norsk-archive)
# The 'live' flag might need adjustment based on mediaSubType
fmts, subs = self._extract_m3u8_formats_and_subtitles(
link, display_id, ext='mp4', live=is_live, fatal=False) # Pass is_live status
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
raise ExtractorError(error_message, expected=True)
return {
'id': media_info.get('id') or display_id,
'display_id': display_id,
'title': title,
'formats': formats,
'subtitles': subtitles,
'release_timestamp': release_timestamp,
# Report 'is_live' based on detected mediaSubType
'is_live': is_live or None # Report None if not explicitly Live
}

Loading…
Cancel
Save