You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yt-dlp/yt_dlp/extractor/tvp.py

601 lines
23 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import itertools
import random
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
dict_get,
int_or_none,
js_to_json,
str_or_none,
strip_or_none,
traverse_obj,
try_get,
url_or_none,
)
class TVPIE(InfoExtractor):
    """Extractor for Telewizja Polska pages addressed by a numeric page ID.

    The page is downloaded and searched for one of the client-side data
    objects (``window.__newsData``, ``__videoData``, ``__websiteData``,
    ``__directoryData``); when none is present, the classic server-side
    rendered markup is searched for an embedded video ID instead.  Actual
    media extraction is delegated to TVPEmbedIE via ``tvp:<id>`` URLs.
    """
    IE_NAME = 'tvp'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|belsat\.eu)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)(?:[/?#]|$)'

    _TESTS = [{
        # TVPlayer 3
        'url': 'https://wilno.tvp.pl/75865949/rozmowa-tygodnia-z-andriusem-vainysem-o-wizycie-s-holowni',
        'info_dict': {
            'id': '75866176',
            'ext': 'mp4',
            'title': 'Rozmowa tygodnia z Andriusem Vaišnysem o wizycie S. Hołowni',
            'alt_title': 'md5:51cc9faf4623ba33aa5191bb83f3f76a',
            'duration': 169,
            'age_limit': 0,
            'release_timestamp': 1707591120,
            'release_date': '20240210',
            'thumbnail': r're:https://.+',
        },
    }, {
        # TVPlayer 2 (JSON)
        'url': 'https://jp2.tvp.pl/48566934/o-suwerennosci-narodu-i-upadku-totalitaryzmu-przemowienie-powitalne',
        'info_dict': {
            'id': '48566934',
            'ext': 'mp4',
            'title': 'O suwerenności narodu i upadku totalitaryzmu. Przemówienie powitalne',
            'duration': 527,
            'age_limit': 0,
            'release_timestamp': 1592388480,
            'release_date': '20200617',
            'thumbnail': r're:https://.+',
        },
    }, {
        # TVPlayer legacy
        'url': 'https://www.tvp.pl/polska-press-video-uploader/wideo/62042351',
        'info_dict': {
            'id': '62042351',
            'ext': 'mp4',
            'title': 'Wideo',
            'description': 'Wideo Kamera',
            'duration': 24,
            'age_limit': 0,
            'thumbnail': r're:https://.+',
        },
    }, {
        # client-side rendered (regional) program (playlist) page
        'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
        'info_dict': {
            'id': '9660819',
            'description': 'Od poniedziałku do piątku o 19:00.',
            'title': 'Rozmowa dnia',
        },
        'playlist_mincount': 1800,
        'params': {
            'skip_download': True,
        },
    }, {
        # yet another vue page
        'url': 'https://jp2.tvp.pl/46925618/filmy',
        'info_dict': {
            'id': '46925618',
            'title': 'Filmy',
        },
        'playlist_mincount': 27,
    }, {
        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
        'only_matching': True,
    }, {
        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
        'only_matching': True,
    }, {
        'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
        'only_matching': True,
    }, {
        'url': 'https://tvpworld.com/48583640/tescos-polish-business-bought-by-danish-chain-netto',
        'only_matching': True,
    }, {
        'url': 'https://belsat.eu/83193018/vybary-jak-castka-hibrydnaj-vajny',
        'only_matching': True,
    }]

    def _parse_video(self, url, video_data, page_id):
        """Turn one video object into a url_transparent result handled by TVPEmbedIE.

        `url` and `page_id` are not used here; they are accepted so that this
        method can be called interchangeably with the other page-data parsers.
        """
        video_id = str(video_data.get('_id'))
        return {
            '_type': 'url_transparent',
            'url': f'tvp:{video_id}',
            'ie_key': TVPEmbedIE.ie_key(),
            'id': video_id,
            **traverse_obj(video_data, {
                'title': 'title',
                'duration': 'duration',
                'is_live': 'is_live',
                # `release_date` is divided by 1000 before being used as a timestamp
                'release_timestamp': ('release_date', {int_or_none(scale=1000)}),
            }),
        }

    def _parse_news(self, url, news_data, page_id):
        """Parse a news data object.

        Returns the single attached video merged with the article metadata when
        exactly one video is attached, otherwise a playlist of the attached
        videos (possibly empty).
        """
        # traverse_obj() yields None when the article carries no `video.items`;
        # fall back to an empty sequence instead of iterating over None
        video_items = traverse_obj(news_data, ('video', 'items')) or ()
        videos = [self._parse_video(url, video_data, page_id) for video_data in video_items]

        info_dict = {
            'id': str_or_none(news_data.get('id')) or page_id,
            'title': news_data['title'],
            'alt_title': news_data.get('lead'),
            'description': news_data.get('description'),
        }
        if len(videos) == 1:
            # the video's own fields (id, title, ...) take precedence over the article's
            return {**info_dict, **videos[0]}
        return {
            **info_dict,
            '_type': 'playlist',
            'entries': videos,
        }

    def _get_website_entries(self, url, website_data, page_id, data_type='website'):
        """Lazily yield the entries of a website/directory page, following pagination.

        The first page's entries come from `website_data`; further pages are
        fetched by re-requesting `url` with an increasing `page` query parameter
        until a page contains neither `videos` nor `items`.
        """
        parser = self._parse_directory_website if data_type == 'directory' else self._parse_video

        def extract_videos(wd):
            if wd.get('latestVideo'):
                yield parser(url, wd['latestVideo'], page_id)
            for video in wd.get('videos') or []:
                yield parser(url, video, page_id)
            for video in wd.get('items') or []:
                yield parser(url, video, page_id)

        yield from extract_videos(website_data)

        # Either counter may be absent from the page data; comparing None values
        # would raise TypeError, so treat missing counters as "single page"
        total_count = int_or_none(website_data.get('items_total_count'))
        per_page = int_or_none(website_data.get('items_per_page'))
        if total_count is None or per_page is None or total_count <= per_page:
            return

        for page in itertools.count(2):
            page_website_data = self._find_data(data_type, self._download_webpage(
                url, page_id, note=f'Downloading {data_type} page #{page}',
                query={'page': page}), page_id)
            if not page_website_data.get('videos') and not page_website_data.get('items'):
                break
            yield from extract_videos(page_website_data)

    def _parse_website(self, url, website_data, page_id):
        """Build a playlist result for a website data object."""
        return {
            '_type': 'playlist',
            'entries': self._get_website_entries(url, website_data, page_id),
            'id': page_id,
            'title': website_data.get('title'),
            'description': website_data.get('lead'),
        }

    def _parse_directory_website(self, url, website_data, page_id):
        """Build a url_transparent result pointing at a website listed in a directory.

        `url` and `page_id` are unused; see `_parse_video` for the rationale.
        """
        website_id = str_or_none(website_data.get('_id'))
        return {
            '_type': 'url_transparent',
            'url': website_data['url'],
            'id': website_id,
            'title': website_data.get('title'),
            'description': website_data.get('lead'),
        }

    def _parse_directory(self, url, directory_data, page_id):
        """Build a playlist result for a directory data object."""
        return {
            '_type': 'playlist',
            'entries': self._get_website_entries(url, directory_data, page_id, data_type='directory'),
            'id': page_id,
            'title': directory_data.get('title'),
            'description': directory_data.get('lead'),
        }

    def _find_data(self, data_type, webpage, video_id, **kwargs):
        """Extract the JSON assigned to ``window.__<data_type>Data`` in `webpage`.

        Extra keyword arguments are forwarded to `_search_json` (e.g. `default`).
        """
        return self._search_json(
            rf'window\.__{data_type}Data\s*=', webpage, f'{data_type} data', video_id,
            transform_source=js_to_json, **kwargs)

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, page_id)

        # The URL may redirect to a VOD
        # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
        for ie_cls in (TVPVODSeriesIE, TVPVODVideoIE):
            if ie_cls.suitable(urlh.url):
                return self.url_result(urlh.url, ie=ie_cls.ie_key(), video_id=page_id)

        # client-side rendered pages: the first data object found decides the parser
        for (dt, parse) in (
            ('news', self._parse_news),
            ('video', self._parse_video),
            ('website', self._parse_website),
            ('directory', self._parse_directory),
        ):
            data = self._find_data(dt, webpage, page_id, default=None)
            if data:
                return parse(url, data, page_id)

        # classic server-side rendered sites
        video_id = self._search_regex([
            r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
            r"object_id\s*:\s*'(\d+)'",
            r'data-video-id="(\d+)"',
        ], webpage, 'video id', default=page_id)

        description = self._og_search_description(webpage, default=None)
        if not description and '//s.tvp.pl/files/portal/v' in webpage:
            # the meta description is only consulted on pages that reference this asset path
            description = self._html_search_meta('description', webpage, default=None)

        return {
            '_type': 'url_transparent',
            'url': 'tvp:' + video_id,
            'description': description or None,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'ie_key': 'TVPEmbed',
        }
class TVPStreamIE(InfoExtractor):
    """Extractor for live channels listed on stream.tvp.pl.

    Accepts both web URLs (optionally carrying a `channel_id` query parameter)
    and the `tvpstream:<channel_id>` pseudo-URL.  An empty channel ID selects
    the first channel of the list published by the page.  The currently live
    item of the channel is handed over to TVPEmbedIE.
    """
    IE_NAME = 'tvp:stream'
    _VALID_URL = r'(?:tvpstream:|https?://(?:tvpstream\.vod|stream)\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
    _TESTS = [{
        'url': 'https://stream.tvp.pl/?channel_id=56969941',
        'only_matching': True,
    }, {
        # untestable as "video" id changes many times across a day
        'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
        'only_matching': True,
    }, {
        'url': 'tvpstream:39821455',
        'only_matching': True,
    }, {
        # the default stream when you provide no channel_id, most probably TVP Info
        'url': 'tvpstream:',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        channel_id = self._match_id(url)
        # label used for log output and error context when no channel was requested
        display_id = channel_id or 'default'

        # NOTE: this used to read `f'...{channel_id}' or 'default'`; the f-string is
        # always truthy, so that fallback never applied.  The request itself is kept
        # unchanged: an empty channel_id is sent as-is.
        channel_url = self._proto_relative_url(f'//stream.tvp.pl/?channel_id={channel_id}')
        webpage = self._download_webpage(channel_url, display_id, 'Downloading channel webpage')
        channels = self._search_json(
            r'window\.__channels\s*=', webpage, 'channel list', display_id,
            contains_pattern=r'\[\s*{(?s:.+)}\s*]')

        if channel_id:
            channel = traverse_obj(channels, (lambda _, v: channel_id == str(v['id'])), get_all=False)
        else:
            # no channel requested: take the first one from the list
            channel = traverse_obj(channels, 0)
        if not channel:
            raise ExtractorError(f'Channel {display_id} was not found in the channel list')

        audition = traverse_obj(channel, ('items', lambda _, v: v['is_live'] is True), get_all=False)
        if not audition or audition.get('video_id') is None:
            raise ExtractorError(f'No live item found for channel {display_id}')

        return {
            '_type': 'url_transparent',
            'id': channel_id or str_or_none(channel.get('id')),
            'url': 'tvp:{}'.format(audition['video_id']),
            'title': audition.get('title'),
            'alt_title': channel.get('title'),
            'is_live': True,
            'ie_key': 'TVPEmbed',
        }
class TVPEmbedIE(InfoExtractor):
    """Extractor for TVP player embeds and the internal `tvp:<id>` pseudo-URL.

    Queries the TVPlayer2 `getTvpConfig` JSONP endpoint for the object ID and
    builds formats, thumbnails and subtitles from the returned `content`.
    """
    IE_NAME = 'tvp:embed'
    IE_DESC = 'Telewizja Polska'
    _GEO_BYPASS = False
    # NOTE: verbose-mode pattern; its text is kept exactly as-is because
    # _EMBED_REGEX below slices off the leading '(?x)' and embeds the rest
    _VALID_URL = r'''(?x)
(?:
tvp:
|https?://
(?:[^/]+\.)?
(?:tvp(?:parlament)?\.pl|tvp\.info|tvpworld\.com|swipeto\.pl)/
(?:sess/
(?:tvplayer\.php\?.*?object_id
|TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
|shared/details\.php\?.*?object_id)
=)
(?P<id>\d+)
'''
    _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL[4:]})']

    _TESTS = [{
        'url': 'tvp:194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 Władek',
            'description': 'md5:76649d2014f65c99477be17f23a4dead',
            'age_limit': 12,
            'duration': 2652,
            'series': 'Czas honoru',
            'episode': 'Episode 13',
            'episode_number': 13,
            'season': 'sezon 1',
            'thumbnail': r're:https://.+',
        },
    }, {
        'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
        'info_dict': {
            'id': '51247504',
            'ext': 'mp4',
            'title': 'Razmova 091220',
            'duration': 876,
            'age_limit': 0,
            'thumbnail': r're:https://.+',
        },
    }, {
        # TVPlayer2 embed URL
        'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
        'only_matching': True,
    }, {
        'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
        'only_matching': True,
    }, {
        # pulsembed on dziennik.pl
        'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # it could be anything that is a valid JS function name
        callback = random.choice((
            'jebac_pis',
            'jebacpis',
            'ziobro',
            'sasin70',
            'sasin_przejebal_70_milionow_PLN',
            'tvp_is_a_state_propaganda_service',
        ))

        webpage = self._download_webpage(
            f'https://www.tvp.pl/sess/TVPlayer2/api.php?id={video_id}&@method=getTvpConfig&@callback={callback}', video_id)

        # stripping JSONP padding
        # (fixed-size prefix/suffix around the callback name — presumably matches
        # the wrapper emitted by the endpoint; TODO confirm if the API changes)
        datastr = webpage[15 + len(callback):-3]
        if datastr.startswith('null,'):
            # a leading `null,` means the remainder describes an error
            error = self._parse_json(datastr[5:], video_id, fatal=False)
            error_desc = traverse_obj(error, (0, 'desc'))

            if error_desc == 'Obiekt wymaga płatności':
                raise ExtractorError('Video requires payment and log-in, but log-in is not implemented')

            raise ExtractorError(error_desc or 'unexpected JSON error')

        content = self._parse_json(datastr, video_id)['content']
        info = content['info']
        is_live = try_get(info, lambda x: x['isLive'], bool)

        if info.get('isGeoBlocked'):
            # actual country list is not provided, we just assume it's always available in PL
            self.raise_geo_restricted(countries=['PL'])

        formats = []
        for file in content['files']:
            video_url = url_or_none(file.get('url'))
            if not video_url:
                continue
            ext = determine_ext(video_url, None)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
            elif ext == 'mpd':
                if is_live:
                    # doesn't work with either ffmpeg or native downloader
                    continue
                formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.ism/manifest'):
                formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
            else:
                formats.append({
                    'format_id': 'direct',
                    'url': video_url,
                    'ext': ext or file.get('type'),
                    'fps': int_or_none(traverse_obj(file, ('quality', 'fps'))),
                    'tbr': int_or_none(traverse_obj(file, ('quality', 'bitrate')), scale=1000),
                    'width': int_or_none(traverse_obj(file, ('quality', 'width'))),
                    'height': int_or_none(traverse_obj(file, ('quality', 'height'))),
                })

        title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
        description = dict_get(info, ('description', 'seoDescription'))

        thumbnails = []
        for thumb in content.get('posters') or ():
            thumb_url = thumb.get('src')
            # skip entries without a URL and unexpanded `{width}`/`{height}` templates
            if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
                continue
            thumbnails.append({
                'url': thumb_url,
                'width': thumb.get('width'),
                'height': thumb.get('height'),
            })

        age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
        if age_limit == 1:
            # a reported minimum age of 1 is normalised to 0
            age_limit = 0

        duration = try_get(info, lambda x: x['duration'], int) if not is_live else None

        subtitles = {}
        for sub in content.get('subtitles') or []:
            if not sub.get('url'):
                continue
            # fall back to 'und' rather than failing on entries without a language
            subtitles.setdefault(sub.get('lang') or 'und', []).append({
                'url': sub['url'],
                'ext': sub.get('type'),
            })

        info_dict = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'age_limit': age_limit,
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }

        # vod.tvp.pl
        if info.get('vortalName') == 'vod':
            # Join only the parts that are present, so a missing subtitle does not
            # produce a literal "…, None"; keep the generic title if both are absent
            title_parts = [str(part) for part in (info.get('title'), info.get('subtitle')) if part]
            info_dict.update({
                'title': ', '.join(title_parts) or title,
                'series': info.get('title'),
                'season': info.get('season'),
                'episode_number': info.get('episode'),
            })

        return info_dict
class TVPVODBaseIE(InfoExtractor):
    """Shared helpers for the vod.tvp.pl extractors (API access and item parsing)."""
    _API_BASE_URL = 'https://vod.tvp.pl/api/products'

    def _call_api(self, resource, video_id, query=None, **kwargs):
        """Request `resource` from the products API and return the decoded JSON.

        `query` is merged over the default `lang`/`platform` parameters; other
        keyword arguments are forwarded to `_download_json_handle`.  4xx
        responses are downloaded as well so that the error code from the
        response body can be reported.

        Raises ExtractorError when the final HTTP status is not 2xx.
        """
        def is_valid(status):
            return 200 <= status < 300

        document, urlh = self._download_json_handle(
            f'{self._API_BASE_URL}/{resource}', video_id,
            query={'lang': 'pl', 'platform': 'BROWSER', **(query or {})},
            expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs)
        if is_valid(urlh.status):
            return document
        # the error body is not guaranteed to be a JSON object, so look the code up safely
        error_code = traverse_obj(document, 'code')
        raise ExtractorError(f'Woronicza said: {error_code} (HTTP {urlh.status})')

    def _parse_video(self, video, with_url=True):
        """Map an API item to info-dict fields.

        With `with_url` set, the result is a `url` type entry pointing at the
        item's `webUrl` for TVPVODVideoIE; otherwise only metadata is returned.
        """
        info_dict = traverse_obj(video, {
            'id': ('id', {str_or_none}),
            'title': 'title',
            'age_limit': ('rating', {int_or_none}),
            'duration': ('duration', {int_or_none}),
            'episode_number': ('number', {int_or_none}),
            'series': ('season', 'serial', 'title', {str_or_none}),
            'thumbnails': ('images', ..., ..., {'url': ('url', {url_or_none})}),
        })
        # prefer `lead`, fall back to `description`
        info_dict['description'] = clean_html(dict_get(video, ('lead', 'description')))

        if with_url:
            info_dict.update({
                '_type': 'url',
                'url': video['webUrl'],
                'ie_key': TVPVODVideoIE.ie_key(),
            })

        return info_dict
class TVPVODVideoIE(TVPVODBaseIE):
    """Extractor for single videos and live channels on vod.tvp.pl."""
    IE_NAME = 'tvp:vod'
    _VALID_URL = r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)/?(?:[?#]|$)'

    _TESTS = [{
        'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
        'info_dict': {
            'id': '311357',
            'ext': 'mp4',
            'title': 'Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
            'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c',
            'duration': 300,
            'episode_number': 24,
            'episode': 'Episode 24',
            'age_limit': 0,
            'series': 'Laboratorium alchemika',
            'thumbnail': 're:https?://.+',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667',
        'info_dict': {
            'id': '339667',
            'ext': 'mp4',
            'title': 'Ukraiński sługa narodu',
            'description': 'md5:b7940c0a8e439b0c81653a986f544ef3',
            'age_limit': 12,
            'duration': 3051,
            'thumbnail': 're:https?://.+',
            'subtitles': 'count:2',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'note': 'embed fails with "payment required"',
        'url': 'https://vod.tvp.pl/seriale,18/polowanie-na-cmy-odcinki,390116/odcinek-7,S01E07,398869',
        'info_dict': {
            'id': '398869',
            'ext': 'mp4',
            'title': 'odc. 7',
            'description': 'md5:dd2bb33f023dc5c2fbaddfbe4cb5dba0',
            'duration': 2750,
            'age_limit': 16,
            'series': 'Polowanie na ćmy',
            'episode_number': 7,
            'episode': 'Episode 7',
            'thumbnail': 're:https?://.+',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://vod.tvp.pl/live,1/tvp-world,399731',
        'info_dict': {
            'id': '399731',
            'ext': 'mp4',
            'title': r're:TVP WORLD \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
            'live_status': 'is_live',
            'thumbnail': 're:https?://.+',
        },
    }]

    def _real_extract(self, url):
        category, video_id = self._match_valid_url(url).group('category', 'id')

        # URLs under the `live,1` category are live channels and use a different API entity
        is_live = category == 'live,1'
        entity = 'lives' if is_live else 'vods'
        info_dict = self._parse_video(self._call_api(f'{entity}/{video_id}', video_id), with_url=False)

        playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})

        formats = []
        for manifest_url in traverse_obj(playlist, ('sources', 'HLS', ..., 'src')):
            formats.extend(self._extract_m3u8_formats(manifest_url, video_id, fatal=False))
        for manifest_url in traverse_obj(playlist, ('sources', 'DASH', ..., 'src')):
            formats.extend(self._extract_mpd_formats(manifest_url, video_id, fatal=False))
        info_dict['formats'] = formats

        subtitles = {}
        for sub in playlist.get('subtitles') or []:
            sub_url = url_or_none(sub.get('url'))
            if not sub_url:
                # an entry without a usable URL must not abort the whole extraction
                continue
            subtitles.setdefault(sub.get('language') or 'und', []).append({
                'url': sub_url,
                'ext': 'ttml',
            })
        info_dict['subtitles'] = subtitles

        info_dict['is_live'] = is_live

        return info_dict
class TVPVODSeriesIE(TVPVODBaseIE):
    """Extractor for series pages (`...-odcinki,<id>`) on vod.tvp.pl.

    Produces a playlist whose entries are the episodes of every season, each
    pointing back at TVPVODVideoIE.
    """
    IE_NAME = 'tvp:vod:series'
    _VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+-odcinki,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$'

    _TESTS = [{
        'url': 'https://vod.tvp.pl/seriale,18/ranczo-odcinki,316445',
        'info_dict': {
            'id': '316445',
            'title': 'Ranczo',
            'age_limit': 12,
            'categories': ['seriale'],
        },
        'playlist_count': 130,
    }, {
        'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514',
        'only_matching': True,
    }, {
        'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338',
        'only_matching': True,
    }]

    def _entries(self, seasons, playlist_id):
        """Lazily yield the parsed episodes of each season, one API call per season."""
        for season in seasons:
            episodes = self._call_api(
                f'vods/serials/{playlist_id}/seasons/{season["id"]}/episodes', playlist_id,
                note=f'Downloading episode list for {season["title"]}')
            yield from map(self._parse_video, episodes)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        metadata = self._call_api(
            f'vods/serials/{playlist_id}', playlist_id,
            note='Downloading serial metadata')
        seasons = self._call_api(
            f'vods/serials/{playlist_id}/seasons', playlist_id,
            note='Downloading season list')

        # `description` and `lead` are alternative top-level keys (as in
        # TVPVODBaseIE._parse_video), not a nested path; take the first non-empty one
        description = clean_html(dict_get(metadata, ('description', 'lead')))

        # avoid emitting `[None]` when the main category is not provided
        main_category = traverse_obj(metadata, ('mainCategory', 'name'))

        return self.playlist_result(
            self._entries(seasons, playlist_id), playlist_id, strip_or_none(metadata.get('title')),
            description,
            categories=[main_category] if main_category else None,
            age_limit=int_or_none(metadata.get('rating')),
        )