@ -1,22 +1,27 @@
import base64
import json
import re
import urllib . parse
from . common import InfoExtractor
from . . utils import js_to_json
from . . utils import ExtractorError , determine_ext , join_nonempty
def decode_b64_url(code):
    """Decode a URL hidden inside a JavaScript array-literal expression.

    The input is expected to contain a bracketed list of string fragments,
    e.g. ``["aHR0", "cHM6", "Ly9h"].join("")``.  Everything before the first
    ``[`` is ignored; the fragments up to the next ``]`` are concatenated
    (whitespace, quotes and commas are dropped), percent-decoded,
    base64-decoded and finally interpreted as UTF-8 text.

    @param code  JavaScript snippet containing the bracketed fragment list
    @returns     The decoded string
    @raises ValueError  if no ``[...]`` literal is present, or if the payload
                        is not valid base64 / UTF-8 (``binascii.Error`` and
                        ``UnicodeDecodeError`` are both ValueError subclasses)
    """
    mobj = re.match(r'[^[]*\[([^]]*)\]', code)
    if mobj is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'groups'" AttributeError
        raise ValueError('No bracketed array literal found in obfuscated URL code')

    # Strip the JS list syntax so only the (percent-encoded) base64 text remains
    encoded = re.sub(r'[\s"\',]', '', mobj.group(1))
    return base64.b64decode(urllib.parse.unquote(encoded)).decode('utf-8')
class RTPIE ( InfoExtractor ) :
_VALID_URL = r ' https?://(?:www \ .)?rtp \ .pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+) '
_VALID_URL = r ' https?://(?: (?:(?: www\ .)?rtp \ .pt/play/(? P<subarea>.*/)?p(?P<program_id>[0-9]+)/(?P<episode_id>e[0-9]+/)?)|(?:arquivos\ .rtp \ .pt/conteudos/))(?P<id>[^/?#]+)/? '
_TESTS = [ {
' url ' : ' http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas ' ,
' md5 ' : ' e736ce0c665e459ddb818546220b4ef8 ' ,
' url ' : ' https://www.rtp.pt/play/p9165/e562949/por-do-sol ' ,
' info_dict ' : {
' id ' : ' e174042 ' ,
' ext ' : ' mp 3 ' ,
' title ' : ' P aixões Cruzadas ' ,
' description ' : ' As paixões musicais de António Cartaxo e António Macedo ' ,
' id ' : ' por-do-sol ' ,
' ext ' : ' mp 4 ' ,
' title ' : ' P ôr do Sol Episódio 1 - de 16 Ago 2021 ' ,
' description ' : ' Madalena Bourbon de Linhaça vive atormentada pelo segredo que esconde desde 1990. Matilde Bourbon de Linhaça sonha fugir com o seu amor proibido. O en ' ,
' thumbnail ' : r ' re:^https?://.* \ .jpg ' ,
} ,
} , {
@ -30,76 +35,82 @@ class RTPIE(InfoExtractor):
' thumbnail ' : r ' re:^https?://.* \ .jpg ' ,
} ,
} , {
' url ' : ' http ://www.rtp.pt/play/p831/a-quimica-das-coisas' ,
' url ' : ' http s ://www.rtp.pt/play/p831/e205093 /a-quimica-das-coisas' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.rtp.pt/play/estudoemcasa/p7776/ portugues-1-ano' ,
' url ' : ' https://www.rtp.pt/play/estudoemcasa/p7776/ e500050/ portugues-1-ano' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.rtp.pt/play/palco/p13785/l7nnon ' ,
' url ' : ' https://www.rtp.pt/play/palco/p9138/jose-afonso-traz-um-amigo-tambem ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.rtp.pt/play/p510/e798152/aleixo-fm ' ,
' only_matching ' : True ,
} ]
_RX_OBFUSCATION = re . compile ( r ''' (?xs)
atob \s * \( \s * decodeURIComponent \s * \( \s *
( \[ [ 0 - 9 A - Za - z % , ' " ]* \ ])
\s * \. \s * join \( \s * ( ? : " " | ' ' ) \s * \) \s * \) \s * \)
''' )
def __unobfuscate(self, data, *, video_id):
    """Turn a (possibly obfuscated) JS value into a JSON string.

    When ``data`` starts with ``{``, every match of ``_RX_OBFUSCATION``
    (an ``atob(decodeURIComponent([...].join("")))`` call) is replaced by
    the JSON string literal it evaluates to.  The result - or ``data``
    unchanged when it does not start with ``{`` - is then passed through
    ``js_to_json``.

    @param data      JavaScript source text of the value
    @param video_id  Passed to ``_parse_json`` when parsing the fragment list
    @returns         The value returned by ``js_to_json``
    """
    if not data.startswith('{'):
        return js_to_json(data)

    def _decode_call(mobj):
        # Group 1 is the bracketed list of string fragments
        fragments = self._parse_json(mobj.group(1), video_id)
        percent_encoded = ''.join(fragments)
        raw = base64.b64decode(urllib.parse.unquote(percent_encoded))
        # Re-quote the decoded text as a JSON string literal
        return json.dumps(raw.decode('iso-8859-1'))

    return js_to_json(self._RX_OBFUSCATION.sub(_decode_call, data))
def _real_extract ( self , url ) :
video_id = self . _match_id ( url )
webpage = self . _download_webpage ( url , video_id )
title = self . _html_search_meta (
' twitter:title ' , webpage , display_name = ' title ' , fatal = True )
f , config = self . _search_regex (
r ''' (?sx)
( ? : var \s + f \s * = \s * ( ? P < f > " .*? " | { [ ^ ; ] + ? } ) ; \s * ) ?
var \s + player1 \s + = \s + new \s + RTPPlayer \s * \( ( ? P < config > { ( ? : ( ? ! \* / ) . ) + ? } ) \) ; ( ? ! \s * \* / )
''' , webpage,
' player config ' , group = ( ' f ' , ' config ' ) )
config = self . _parse_json (
config , video_id ,
lambda data : self . __unobfuscate ( data , video_id = video_id ) )
f = config [ ' file ' ] if not f else self . _parse_json (
f , video_id ,
lambda data : self . __unobfuscate ( data , video_id = video_id ) )
formats = [ ]
if isinstance ( f , dict ) :
f_hls = f . get ( ' hls ' )
if f_hls is not None :
formats . extend ( self . _extract_m3u8_formats (
f_hls , video_id , ' mp4 ' , ' m3u8_native ' , m3u8_id = ' hls ' ) )
f_dash = f . get ( ' dash ' )
if f_dash is not None :
formats . extend ( self . _extract_mpd_formats ( f_dash , video_id , mpd_id = ' dash ' ) )
# Remove comments from webpage source
webpage = re . sub ( r ' (?s)/ \ *.* \ */ ' , ' ' , webpage )
webpage = re . sub ( r ' (?m)(?:^| \ s)//.*$ ' , ' ' , webpage )
title = self . _html_search_regex ( r ' <title>(.+?)</title> ' , webpage , ' title ' , default = ' ' )
# Replace irrelevant text in title
title = title . replace ( ' - RTP Play - RTP ' , ' ' ) or self . _html_search_meta ( ' twitter:title ' , webpage )
if ' Este episódio não se encontra disponí ' in title :
raise ExtractorError ( ' Episode unavailable ' , expected = True )
part = self . _html_search_regex ( r ' section \ -parts.*<span.*>(.+?)</span>.*</ul> ' , webpage , ' part ' , default = None )
title = join_nonempty ( title , part , delim = ' ' )
# Get file key
file_key = self . _search_regex ( r ' \ s*fileKey: " ([^ " ]+) " , ' , webpage , ' file key - open ' , default = None )
if file_key is None :
self . write_debug ( ' url: obfuscated ' )
file_key = self . _search_regex ( r ' \ s*fileKey: atob \ ( decodeURIComponent \ ((.*) \ ) \ ) \ ), ' , webpage , ' file key ' )
url = decode_b64_url ( file_key ) or ' '
else :
formats . append ( {
' format_id ' : ' f ' ,
' url ' : f ,
' vcodec ' : ' none ' if config . get ( ' mediaType ' ) == ' audio ' else None ,
} )
self . write_debug ( ' url: clean ' )
url = file_key
subtitles = { }
if ' mp3 ' in url :
full_url = ' https://cdn-ondemand.rtp.pt ' + url
elif ' mp4 ' in url :
full_url = f ' https://streaming-vod.rtp.pt/dash { url } /manifest.mpd '
else :
full_url = None
if not full_url :
raise ExtractorError ( ' No valid media source found in page ' )
poster = self . _search_regex ( r ' \ s*poster: " ([^ " ]+) " ' , webpage , ' poster ' , fatal = False )
vtt = config . get ( ' vtt ' )
# Finally send pure JSON string for JSON parsing
full_url = full_url . replace ( ' drm-dash ' , ' dash ' )
ext = determine_ext ( full_url )
if ext == ' mpd ' :
# Download via mpd file
self . write_debug ( ' formats: mpd ' )
formats = self . _extract_mpd_formats ( full_url , video_id )
else :
self . write_debug ( ' formats: ext= {ext} ' )
formats = [ {
' url ' : full_url ,
' ext ' : ext ,
} ]
subtitles = { }
vtt = self . _search_regex ( r ' \ s*vtt: (.*]]), \ s+ ' , webpage , ' vtt ' , default = None )
if vtt is not None :
for lcode , lname , url in vtt :
subtitles . setdefault ( lcode , [ ] ) . append ( {
vtt_object = self . _parse_json ( vtt . replace ( " ' " , ' " ' ) , full_url )
self . write_debug ( f ' vtt: { len ( vtt_object ) } subtitles ' )
for lcode , lname , url in vtt_object :
subtitles . setdefault ( lcode . lower ( ) , [ ] ) . append ( {
' name ' : lname ,
' url ' : url ,
} )
@ -109,6 +120,6 @@ class RTPIE(InfoExtractor):
' title ' : title ,
' formats ' : formats ,
' description ' : self . _html_search_meta ( [ ' description ' , ' twitter:description ' ] , webpage ) ,
' thumbnail ' : config. get ( ' poster' ) or self . _og_search_thumbnail ( webpage ) ,
' thumbnail ' : poster or self . _og_search_thumbnail ( webpage ) ,
' subtitles ' : subtitles ,
}