@ -2,251 +2,234 @@
from __future__ import unicode_literals
import re
import json
from . common import InfoExtractor
from . . compat import (
compat_str ,
compat_urlparse ,
)
from . . utils import (
ExtractorError ,
GeoRestrictedError ,
int_or_none ,
qualities ,
try_get ,
merge_dicts ,
parse_iso8601 ,
parse_qs ,
strip_or_none ,
traverse_obj ,
url_or_none ,
urljoin ,
)
class ArteTVBaseIE(InfoExtractor):
    """Shared constants and helpers for the Arte.tv extractors."""

    # Language editions accepted in Arte URLs, written as a regex alternation
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    _API_BASE_V1 = 'https://api.arte.tv/api/player/v1'
    _API_BASE_V2 = 'https://api.arte.tv/api/player/v2'
    _API_BASE = 'https://api.arte.tv/api/player/v2'

    def _get_api_authorization_header(self, url):
        """Fetch the Authorization header required for api.arte.tv/api/player/v2.

        Downloads the programme page `url` and the static guide manifest, and
        extracts the bearer token from the latter.

        Returns a dict with two keys:
            'page_metadata': the decoded `window.__INITIAL_STATE__` object of
                             the page, or {} when it is absent or not valid JSON
            'headers':       HTTP headers (including the bearer token) to send
                             with player v2 API requests
        """
        # This request is made to fulfil the authorization requirements of
        # api/player/v2; since the page has to be requested anyway, the
        # metadata embedded in it is returned as well.
        html_page = self._download_webpage(url, 'dummy_auth_request_with_some_meta')
        # default=None: the page metadata is only a bonus, so a page without
        # it must not abort the extraction
        page_metadata_json = self._search_regex(
            r'window.__INITIAL_STATE__ = (\{.*\});\n', html_page, 'initial_state',
            default=None)
        page_metadata = {}
        if page_metadata_json:
            try:
                page_metadata = json.loads(page_metadata_json)
            except ValueError:
                # malformed JSON: fall back to the empty metadata
                page_metadata = {}

        manifest_js = self._download_webpage(
            'https://static-cdn.arte.tv/guide/manifest.js', 'arte_api_token')
        token = self._search_regex(
            r'"default":{"token":"([a-zA-Z0-9_-]*)"}', manifest_js, 'token')

        return {
            'page_metadata': page_metadata,
            'headers': {
                'Authorization': 'Bearer %s' % token,
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'en-GB,en;q=0.8,de-DE;q=0.5,de;q=0.3',
                'Referer': url,
                'Origin': 'https://www.arte.tv',
                'Connection': 'keep-alive',
                'Sec-Fetch-Dest': 'empty',
                'Sec-Fetch-Mode': 'cors',
                'Sec-Fetch-Site': 'same-site',
                'Pragma': 'no-cache',
                'Cache-Control': 'no-cache',
                'TE': 'trailers',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0',
            },
        }

    @classmethod
    def _match_valid_url(cls, url):
        """Match `url` against this extractor's `_VALID_URL`; return the match object or None."""
        return re.match(cls._VALID_URL, url)

    def _extract_m3u8_formats_and_subtitles(self, *args, **kwargs):
        """Return `(formats, subtitles)` for an HLS manifest.

        Thin wrapper around `_extract_m3u8_formats`; the subtitles part is
        always an empty dict.
        """
        return self._extract_m3u8_formats(*args, **kwargs), {}
class ArteTVIE ( ArteTVBaseIE ) :
# Accepted URL shapes (verbose regex):
#   https://www.arte.tv/<lang>/videos/<id>...
#   https://api.arte.tv/api/player/v<N>/config/<lang>/<id>
#   arte://program/<id>
# where <id> is e.g. 088501-000-A, or LIVE. The language ends up in group
# `lang` (website URLs) or `lang_2` (API URLs); neither is set for arte:// URLs.
# The named group `id` must be defined only once, otherwise re.compile() fails.
_VALID_URL = r'''(?x)
    (?:https?://
        (?:
            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
        )
    |arte://program)
        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [ {
' url ' : ' https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/ ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/ ' ,
' info_dict ' : {
' id ' : ' 088501-000-A ' ,
' id ' : ' 100103-000-A ' ,
' title ' : ' USA: Dyskryminacja na porodówce ' ,
' description ' : ' md5:242017b7cce59ffae340a54baefcafb1 ' ,
' alt_title ' : ' ARTE Reportage ' ,
' upload_date ' : ' 20201103 ' ,
' duration ' : 554 ,
' thumbnail ' : r ' re:https://api-cdn \ .arte \ .tv/.+940x530 ' ,
' timestamp ' : 1604417980 ,
' ext ' : ' mp4 ' ,
' title ' : ' Mexico: Stealing Petrol to Survive ' ,
' upload_date ' : ' 20190628 ' ,
} ,
' params ' : {
' format ' : ' bestvideo ' ,
' skip_download ' : ' m3u8 ' ,
} ,
} , {
' url ' : ' https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/ ' ,
' only_matching ' : True ,
' note ' : ' No alt_title ' ,
' url ' : ' https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/ ' ,
' info_dict ' : {
' id ' : ' 110371-000-A ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20220718 ' ,
' duration ' : 154 ,
' timestamp ' : 1658162460 ,
' description ' : ' md5:5890f36fe7dccfadb8b7c0891de54786 ' ,
' title ' : ' La chaleur, supplice des arbres de rue ' ,
' thumbnail ' : ' https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530 ' ,
} ,
' params ' : {
' format ' : ' bestvideo ' ,
' skip_download ' : ' m3u8 ' ,
} ,
} , {
' url ' : ' https://api.arte.tv/api/player/v2/config/de/100605-013-A ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://api.arte.tv/api/player/v2/config/de/LIVE ' ,
' only_matching ' : True ,
} ]
_GEO_BYPASS = True
_LANG_MAP = { # ISO639 -> French abbreviations
' r ' : ' F ' ,
' de ' : ' A ' ,
' en ' : ' E[ANG] ' ,
' es ' : ' E[ESP] ' ,
' it ' : ' E[ITA] ' ,
' pl ' : ' E[POL] ' ,
# XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
# uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
' mul ' : ' EU ' ,
}
_VERSION_CODE_RE = re . compile ( r ''' (?x)
V
( ? P < original_voice > O ? )
( ? P < vlang > [ FA ] | E \[ [ A - Z ] + \] | EU ) ?
( ? P < audio_desc > AUD | )
( ? :
( ? P < has_sub > - ST )
( ? P < sdh_sub > M ? )
( ? P < sub_lang > [ FA ] | E \[ [ A - Z ] + \] | EU )
) ?
''' )
# all obtained by exhaustive testing
_COUNTRIES_MAP = {
' DE_FR ' : (
' BL ' , ' DE ' , ' FR ' , ' GF ' , ' GP ' , ' MF ' , ' MQ ' , ' NC ' ,
' PF ' , ' PM ' , ' RE ' , ' WF ' , ' YT ' ,
) ,
# with both of the below 'BE' sometimes works, sometimes doesn't
' EUR_DE_FR ' : (
' AT ' , ' BL ' , ' CH ' , ' DE ' , ' FR ' , ' GF ' , ' GP ' , ' LI ' ,
' MC ' , ' MF ' , ' MQ ' , ' NC ' , ' PF ' , ' PM ' , ' RE ' , ' WF ' ,
' YT ' ,
) ,
' SAT ' : (
' AD ' , ' AT ' , ' AX ' , ' BG ' , ' BL ' , ' CH ' , ' CY ' , ' CZ ' ,
' DE ' , ' DK ' , ' EE ' , ' ES ' , ' FI ' , ' FR ' , ' GB ' , ' GF ' ,
' GR ' , ' HR ' , ' HU ' , ' IE ' , ' IS ' , ' IT ' , ' KN ' , ' LI ' ,
' LT ' , ' LU ' , ' LV ' , ' MC ' , ' MF ' , ' MQ ' , ' MT ' , ' NC ' ,
' NL ' , ' NO ' , ' PF ' , ' PL ' , ' PM ' , ' PT ' , ' RE ' , ' RO ' ,
' SE ' , ' SI ' , ' SK ' , ' SM ' , ' VA ' , ' WF ' , ' YT ' ,
) ,
}
def _real_extract(self, url):
    """Extract formats and metadata of a single Arte programme.

    Queries `<_API_BASE>/config/<lang>/<id>` and builds the info dict from
    the returned player configuration.

    Raises:
        GeoRestrictedError: the configuration marks the video as restricted
            to an area.
        ExtractorError: the configuration carries no `rights` entry.
    """
    mobj = self._match_valid_url(url)
    video_id = mobj.group('id')
    lang = mobj.group('lang') or mobj.group('lang_2')
    language_code = self._LANG_MAP.get(lang)

    config = self._download_json(
        '{0}/config/{1}/{2}'.format(self._API_BASE, lang, video_id), video_id)
    attributes = config['data']['attributes']

    geoblocking = traverse_obj(attributes, ('restriction', 'geoblocking')) or {}
    if geoblocking.get('restrictedArea'):
        geo_code = geoblocking.get('code')
        raise GeoRestrictedError(
            'Video restricted to {0!r}'.format(geo_code),
            countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

    if not attributes.get('rights'):
        # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
        # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
        raise ExtractorError(
            'Video is not available in this language edition of Arte or broadcast rights expired',
            expected=True)

    formats, subtitles = [], {}
    # HLS variants whose short label starts with 'cc' or 'OGsub'; they are
    # appended after all other formats
    secondary_formats = []
    for stream in attributes['streams']:
        # official player contains code like `e.get("versions")[0].eStat.ml5`
        stream_version = stream['versions'][0]
        stream_version_code = stream_version['eStat']['ml5']

        lang_pref = -1
        m = self._VERSION_CODE_RE.match(stream_version_code)
        if m:
            # Each criterion contributes one binary digit, most significant
            # first, so an earlier criterion always outweighs all later ones
            lang_pref = int(''.join('01'[x] for x in (
                m.group('vlang') == language_code,      # we prefer voice in the requested language
                not m.group('audio_desc'),              # and not the audio description version
                bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
            )))

        # compat_str rather than str, so unicode labels are accepted on Python 2 as well
        short_label = traverse_obj(stream_version, 'shortLabel', expected_type=compat_str, default='?')
        format_note = '{0} [{1}]'.format(stream_version.get('label', 'unknown'), short_label)
        protocol = stream['protocol']

        if protocol.startswith('HLS'):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
            for fmt in fmts:
                fmt.update({
                    'format_note': format_note,
                    'language_preference': lang_pref,
                })
            if short_label.startswith(('cc', 'OGsub')):
                secondary_formats.extend(fmts)
            else:
                formats.extend(fmts)
            # `subs` is a subtitles dict: merge it as a whole instead of
            # iterating over its keys
            subtitles = self._merge_subtitles(subtitles, subs)

        elif protocol in ('HTTPS', 'RTMP'):
            formats.append({
                'format_id': '{0}-{1}'.format(protocol, stream_version_code),
                'url': stream['url'],
                'format_note': format_note,
                'language_preference': lang_pref,
                # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
            })

        else:
            self.report_warning('Skipping stream with unknown protocol {0}'.format(protocol))

        # TODO: chapters from stream['segments']?
        # The JS also looks for chapters in config['data']['attributes']['chapters'],
        # but I am yet to find a video having those

    formats.extend(secondary_formats)
    self._remove_duplicate_formats(formats)

    metadata = attributes['metadata']

    return {
        'id': metadata['providerId'],
        'webpage_url': traverse_obj(metadata, ('link', 'url')),
        'title': traverse_obj(metadata, 'subtitle', 'title'),
        # only set when a subtitle exists (and was therefore used as title)
        'alt_title': metadata.get('subtitle') and metadata.get('title'),
        'description': metadata.get('description'),
        'duration': traverse_obj(metadata, ('duration', 'seconds')),
        'language': metadata.get('language'),
        'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
        'is_live': attributes.get('live', False),
        'formats': formats,
        'subtitles': subtitles,
        'thumbnails': [
            {'url': image['url'], 'id': image.get('caption')}
            for image in metadata.get('images') or [] if url_or_none(image.get('url'))
        ],
    }
def _get_full_title ( self , metadata ) :
if metadata . get ( ' subtitle ' ) :
return ' %s - %s ' % ( metadata . get ( ' title ' ) , metadata . get ( ' subtitle ' ) )
return metadata . get ( ' title ' )
def _get_upload_date ( self , rights ) :
begin = rights . get ( ' begin ' )
if not begin :
return None
date_part = begin . split ( ' T ' ) [ 0 ]
if not date_part :
return None
start_year , start_month , start_day = date_part . split ( ' - ' )
return ' %s %s %s ' % ( start_year , start_month , start_day )
def _get_thumbnail_url ( self , metadata ) :
images = metadata . get ( ' images ' )
if not images or not images [ 0 ] or not images [ 0 ] . get ( ' url ' ) :
return None
return images [ 0 ] . get ( ' url ' )
class ArteTVEmbedIE ( InfoExtractor ) :
_VALID_URL = r ' https?://(?:www \ .)?arte \ .tv/player/v \ d+/index \ .php \ ?.*? \ bjson_url=.+ '
_EMBED_REGEX = [ r ' <(?:iframe|script)[^>]+src=([ " \' ])(?P<url>(?:https?:)?//(?:www \ .)?arte \ .tv/player/v \ d+/index \ .php \ ?.*? \ bjson_url=.+?) \ 1 ' ]
_TESTS = [ {
' url ' : ' https://www.arte.tv/player/v5/index.php?json_url=https % 3A %2F %2F api.arte.tv %2F api %2F player %2F v2 %2F config %2F de %2F 100605-013-A&lang=de&autoplay=true&mute=0100605-013-A ' ,
' info_dict ' : {
@ -256,6 +239,7 @@ class ArteTVEmbedIE(InfoExtractor):
' description ' : ' md5:be40b667f45189632b78c1425c7c2ce1 ' ,
' upload_date ' : ' 20201116 ' ,
} ,
' skip ' : ' Video is not available in this language edition of Arte or broadcast rights expired '
} , {
' url ' : ' https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A ' ,
' only_matching ' : True ,
@ -268,7 +252,7 @@ class ArteTVEmbedIE(InfoExtractor):
webpage ) ]
def _real_extract ( self , url ) :
qs = compat_urlparse. parse_qs( compat_ urlparse. urlparse ( url ) . query )
qs = parse_qs( url)
json_url = qs [ ' json_url ' ] [ 0 ]
video_id = ArteTVIE . _match_id ( json_url )
return self . url_result (
@ -279,41 +263,83 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
# Collection pages: https://www.arte.tv/<lang>/videos/RC-<6 digits>...
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [ {
' url ' : ' https://www.arte.tv/en/videos/RC-016954/earn-a-living/ ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.arte.tv/pl/videos/RC-014123/arte-reportage/ ' ,
' playlist_mincount ' : 100 ,
' info_dict ' : {
' id ' : ' RC-016954 ' ,
' title ' : ' Earn a Living ' ,
' description ' : ' md5:d322c55011514b3a7241f7fb80d494c2 ' ,
' description' : ' md5:84e7bf1feda248bc325ebfac818c476e ' ,
' id' : ' RC-014123 ' ,
' title' : ' ARTE Reportage - najlepsze reportaże ' ,
} ,
' playlist_mincount' : 6 ,
' skip' : ' 404 Not Found ' ,
} , {
' url ' : ' https://www.arte.tv/pl/videos/RC-014123/arte-reportage/ ' ,
' only_matching ' : True ,
' url ' : ' https://www.arte.tv/en/videos/RC-016979/war-in-ukraine/ ' ,
' playlist_mincount ' : 79 ,
' info_dict ' : {
' id ' : ' RC-016979 ' ,
' title ' : ' War in Ukraine ' ,
' description ' : ' On 24 February, Russian armed forces invaded Ukraine. We follow the war day by day and provide background information with special insights, reports and documentaries. ' ,
} ,
} ]
def _real_extract(self, url):
    """Extract an Arte collection as a playlist.

    Queries `<_API_BASE>/playlist/<lang>/<id>` and emits one
    url_transparent entry, handled by ArteTVIE, for every playlist item
    that has a config URL.
    """
    lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
    playlist = self._download_json(
        '{0}/playlist/{1}/{2}'.format(self._API_BASE, lang, playlist_id),
        playlist_id)['data']['attributes']

    entries = [{
        '_type': 'url_transparent',
        'url': video['config']['url'],
        'ie_key': ArteTVIE.ie_key(),
        'id': video.get('providerId'),
        'title': video.get('title'),
        'alt_title': video.get('subtitle'),
        'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
        'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
        # the filter keeps only the items whose ['config']['url'] is set
    } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]

    return self.playlist_result(
        entries, playlist_id,
        traverse_obj(playlist, ('metadata', 'title')),
        traverse_obj(playlist, ('metadata', 'description')))
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for Arte category pages, e.g. /en/videos/politics-and-society/."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Watch documentaries and reportage about politics, society and current affairs.',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept `url` only if neither ArteTVIE nor ArteTVPlaylistIE handles it."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super(ArteTVCategoryIE, cls).suitable(url))

    def _real_extract(self, url):
        """Build a playlist from the video and collection links found on the category page."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for link in re.finditer(
                r'<a\b[^>]+\bhref\s*=\s*(?P<q>"|\'|\b)(?P<url>(?:https?://www\.arte\.tv)?/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video = urljoin(url, link.group('url'))
            if video == url:
                # the page links to itself
                continue
            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video)

        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Drop the part after the last '|'; `title` may be None when the
        # page has neither an og:title nor a <title> element
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return merge_dicts(
            self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title),
            {'description': self._og_search_description(webpage, default=None)})