@ -1,4 +1,5 @@
import base64
import functools
import json
import re
import time
@ -6,17 +7,24 @@ import urllib.parse
import xml . etree . ElementTree
from . common import InfoExtractor
from . . networking import HEADRequest
from . . utils import (
ExtractorError ,
float_or_none ,
int_or_none ,
join_nonempty ,
js_to_json ,
mimetype2ext ,
orderedSet ,
parse_iso8601 ,
replace_extension ,
smuggle_url ,
strip_or_none ,
traverse_obj ,
try_get ,
update_url ,
url_basename ,
url_or_none ,
)
@ -149,6 +157,7 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE ( InfoExtractor ) :
IE_NAME = ' cbc.ca:player '
_VALID_URL = r ' (?:cbcplayer:|https?://(?:www \ .)?cbc \ .ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/ \ ?mediaId=))(?P<id>(?: \ d \ .)? \ d+) '
_GEO_COUNTRIES = [ ' CA ' ]
_TESTS = [ {
' url ' : ' http://www.cbc.ca/player/play/2683190193 ' ,
' md5 ' : ' 64d25f841ddf4ddb28a235338af32e2c ' ,
@ -172,21 +181,20 @@ class CBCPlayerIE(InfoExtractor):
' description ' : ' md5:dd3b692f0a139b0369943150bd1c46a9 ' ,
' timestamp ' : 1425704400 ,
' upload_date ' : ' 20150307 ' ,
' uploader ' : ' CBCC-NEW ' ,
' thumbnail ' : ' http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg ' ,
' chapters ' : [ ] ,
' duration ' : 494.811 ,
' categories ' : [ ' A udioMobile/A ll in a Weekend Montreal' ] ,
' tags ' : ' count: 8 ' ,
' categories ' : [ ' A ll in a Weekend Montreal' ] ,
' tags ' : ' count: 11 ' ,
' location ' : ' Quebec ' ,
' series ' : ' All in a Weekend Montreal ' ,
' season ' : ' Season 2015 ' ,
' season_number ' : 2015 ,
' media_type ' : ' Excerpt ' ,
' genres ' : [ ' Other ' ] ,
} ,
} , {
' url ' : ' http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062 ' ,
' md5 ' : ' 33fcd8f6719b9dd60a5e73adcb83b9f6 ' ,
' info_dict ' : {
' id ' : ' 2164402062 ' ,
' ext ' : ' mp4 ' ,
@ -194,107 +202,168 @@ class CBCPlayerIE(InfoExtractor):
' description ' : ' Tim Mayer has beaten three different forms of cancer four times in five years. ' ,
' timestamp ' : 1320410746 ,
' upload_date ' : ' 20111104 ' ,
' uploader ' : ' CBCC-NEW ' ,
' thumbnail ' : ' https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg ' ,
' chapters ' : [ ] ,
' duration ' : 186.867 ,
' series ' : ' CBC News: Windsor at 6:00 ' ,
' categories ' : [ ' News/Canada/ Windsor' ] ,
' categories ' : [ ' Windsor' ] ,
' location ' : ' Windsor ' ,
' tags ' : [ ' cancer ' ] ,
' creators ' : [ ' Allison Johnson ' ] ,
' tags ' : [ ' Cancer ' , ' News/Canada/Windsor ' , ' Windsor ' ] ,
' media_type ' : ' Excerpt ' ,
' genres ' : [ ' News ' ] ,
} ,
' params ' : { ' skip_download ' : ' m3u8 ' } ,
} , {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
' url ' : ' https://www.cbc.ca/player/play/1.2985700 ' ,
' md5 ' : ' e5e708c34ae6fca156aafe17c43e8b75 ' ,
' info_dict ' : {
' id ' : ' 2657631896 ' ,
' id ' : ' 1.2985700 ' ,
' ext ' : ' mp3 ' ,
' title ' : ' CBC Montreal is organizing its first ever community hackathon! ' ,
' description ' : ' The modern technology we tend to depend on so heavily, is never without it \' s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon. ' ,
' timestamp ' : 1425704400 ,
' upload_date ' : ' 20150307 ' ,
' uploader ' : ' CBCC-NEW ' ,
' thumbnail ' : ' http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg ' ,
' chapters ' : [ ] ,
' duration ' : 494.811 ,
' categories ' : [ ' A udioMobile/A ll in a Weekend Montreal' ] ,
' tags ' : ' count: 8 ' ,
' categories ' : [ ' A ll in a Weekend Montreal' ] ,
' tags ' : ' count: 11 ' ,
' location ' : ' Quebec ' ,
' series ' : ' All in a Weekend Montreal ' ,
' season ' : ' Season 2015 ' ,
' season_number ' : 2015 ,
' media_type ' : ' Excerpt ' ,
' genres ' : [ ' Other ' ] ,
} ,
} , {
' url ' : ' https://www.cbc.ca/player/play/1.1711287 ' ,
' md5 ' : ' 33fcd8f6719b9dd60a5e73adcb83b9f6 ' ,
' info_dict ' : {
' id ' : ' 2164402062 ' ,
' id ' : ' 1.1711287 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Cancer survivor four times over ' ,
' description ' : ' Tim Mayer has beaten three different forms of cancer four times in five years. ' ,
' timestamp ' : 1320410746 ,
' upload_date ' : ' 20111104 ' ,
' uploader ' : ' CBCC-NEW ' ,
' thumbnail ' : ' https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg ' ,
' chapters ' : [ ] ,
' duration ' : 186.867 ,
' series ' : ' CBC News: Windsor at 6:00 ' ,
' categories ' : [ ' News/Canada/ Windsor' ] ,
' categories ' : [ ' Windsor' ] ,
' location ' : ' Windsor ' ,
' tags ' : [ ' cancer ' ] ,
' creators ' : [ ' Allison Johnson ' ] ,
' tags ' : [ ' Cancer ' , ' News/Canada/Windsor ' , ' Windsor ' ] ,
' media_type ' : ' Excerpt ' ,
' genres ' : [ ' News ' ] ,
} ,
' params ' : { ' skip_download ' : ' m3u8 ' } ,
} , {
# Has subtitles
# These broadcasts expire after ~1 month; a new test URL can be found here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
' url ' : ' https://www.cbc.ca/player/play/ 1.7159484 ' ,
' md5 ' : ' 6ed6cd0fc2ef568d2297ba68a763d455 ' ,
' url ' : ' https://www.cbc.ca/player/play/ video/9.6424403 ' ,
' md5 ' : ' 8025909eaffcf0adf59922904def9a5e ' ,
' info_dict ' : {
' id ' : ' 2324213316001 ' ,
' id ' : ' 9.6424403 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' The National | School boards sue social media giants ' ,
' description ' : ' md5:4b4db69322fa32186c3ce426da07402c ' ,
' timestamp ' : 1711681200 ,
' duration ' : 2743.400 ,
' subtitles ' : { ' eng ' : [ { ' ext ' : ' vtt ' , ' protocol ' : ' m3u8_native ' } ] } ,
' thumbnail ' : ' https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg ' ,
' uploader ' : ' CBCC-NEW ' ,
' title ' : ' The National | N.W.T. wildfire emergency ' ,
' description ' : ' md5:ada33d36d1df69347ed575905bfd496c ' ,
' timestamp ' : 1718589600 ,
' duration ' : 2692.833 ,
' subtitles ' : {
' en-US ' : [ {
' name ' : ' English Captions ' ,
' url ' : ' https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt ' ,
} ] ,
} ,
' thumbnail ' : ' https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg ' ,
' chapters ' : ' count:5 ' ,
' upload_date ' : ' 20240329 ' ,
' categories ' : ' count:4 ' ,
' upload_date ' : ' 20240 617 ' ,
' categories ' : [ ' News ' , ' The National ' , ' The National Latest Broadcasts ' ] ,
' series ' : ' The National - Full Show ' ,
' tags ' : ' count:1 ' ,
' creators ' : [ ' News ' ] ,
' tags ' : [ ' The National ' ] ,
' location ' : ' Canada ' ,
' media_type ' : ' Full Program ' ,
' genres ' : [ ' News ' ] ,
} ,
} , {
' url ' : ' https://www.cbc.ca/player/play/video/1.7194274 ' ,
' md5 ' : ' 188b96cf6bdcb2540e178a6caa957128 ' ,
' info_dict ' : {
' id ' : ' 2334524995812 ' ,
' id ' : ' 1.7194274 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' #TheMoment a rare white spirit moose was spotted in Alberta ' ,
' description ' : ' md5:18ae269a2d0265c5b0bbe4b2e1ac61a3 ' ,
' timestamp ' : 1714788791 ,
' duration ' : 77.678 ,
' subtitles ' : { ' eng ' : [ { ' ext ' : ' vtt ' , ' protocol ' : ' m3u8_native ' } ] } ,
' thumbnail ' : ' https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg ' ,
' uploader ' : ' CBCC-NEW ' ,
' chapters ' : ' count:0 ' ,
' upload_date ' : ' 20240504 ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg ' ,
' chapters ' : [ ] ,
' categories ' : ' count:3 ' ,
' series ' : ' The National ' ,
' tags ' : ' count:15 ' ,
' creators ' : [ ' encoder ' ] ,
' tags ' : ' count:17 ' ,
' location ' : ' Canada ' ,
' media_type ' : ' Excerpt ' ,
' upload_date ' : ' 20240504 ' ,
' genres ' : [ ' News ' ] ,
} ,
} , {
' url ' : ' https://www.cbc.ca/player/play/video/9.6427282 ' ,
' info_dict ' : {
' id ' : ' 9.6427282 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Men \' s Soccer - Argentina vs Morocco ' ,
' description ' : ' Argentina faces Morocco on the football pitch at Saint Etienne Stadium. ' ,
' series ' : ' CBC Sports ' ,
' media_type ' : ' Event Coverage ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg ' ,
' timestamp ' : 1721825400.0 ,
' upload_date ' : ' 20240724 ' ,
' duration ' : 10568.0 ,
' chapters ' : [ ] ,
' genres ' : [ ] ,
' tags ' : [ ' 2024 Paris Olympic Games ' ] ,
' categories ' : [ ' Olympics Summer Soccer ' , ' Summer Olympics Replays ' , ' Summer Olympics Soccer Replays ' ] ,
' location ' : ' Canada ' ,
} ,
' params ' : { ' skip_download ' : ' m3u8 ' } ,
} , {
' url ' : ' https://www.cbc.ca/player/play/video/9.6459530 ' ,
' md5 ' : ' 6c1bb76693ab321a2e99c347a1d5ecbc ' ,
' info_dict ' : {
' id ' : ' 9.6459530 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Parts of Jasper incinerated as wildfire rages ' ,
' description ' : ' md5:6f1caa8d128ad3f629257ef5fecf0962 ' ,
' series ' : ' The National ' ,
' media_type ' : ' Excerpt ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg ' ,
' timestamp ' : 1721964091.012 ,
' upload_date ' : ' 20240726 ' ,
' duration ' : 952.285 ,
' chapters ' : [ ] ,
' genres ' : [ ] ,
' tags ' : ' count:23 ' ,
' categories ' : [ ' News (FAST) ' , ' News ' , ' The National ' , ' TV News Shows ' , ' The National ' ] ,
} ,
} , {
' url ' : ' https://www.cbc.ca/player/play/video/9.6420651 ' ,
' md5 ' : ' 71a850c2c6ee5e912de169f5311bb533 ' ,
' info_dict ' : {
' id ' : ' 9.6420651 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Is it a breath of fresh air? Measuring air quality in Edmonton ' ,
' description ' : ' md5:3922b92cc8b69212d739bd9dd095b1c3 ' ,
' series ' : ' CBC News Edmonton ' ,
' media_type ' : ' Excerpt ' ,
' thumbnail ' : ' https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg ' ,
' timestamp ' : 1718220065.768 ,
' upload_date ' : ' 20240612 ' ,
' duration ' : 286.086 ,
' chapters ' : [ ] ,
' genres ' : [ ' News ' ] ,
' categories ' : [ ' News ' , ' Edmonton ' ] ,
' tags ' : ' count:7 ' ,
' location ' : ' Edmonton ' ,
} ,
} , {
' url ' : ' cbcplayer:1.7159484 ' ,
@ -307,25 +376,115 @@ class CBCPlayerIE(InfoExtractor):
' only_matching ' : True ,
} ]
def _parse_param(self, asset_data, name):
    """Look up the entry called *name* in ``asset_data['params']``.

    Only a ``str`` found under that entry's ``'value'`` key is returned.
    NOTE(review): the trailing ``any`` presumably makes traverse_obj yield
    the first match (or None when nothing matches) -- confirm against the
    traverse_obj documentation.
    """
    def is_requested_param(_, param):
        return param['name'] == name

    value_path = ('params', is_requested_param, 'value', {str}, any)
    return traverse_obj(asset_data, value_path)
def _real_extract(self, url):
    """Extract a clip from the CBC player page's embedded initial state.

    The player page is always downloaded and its
    ``window.__INITIAL_STATE__`` JSON is parsed; everything below reads
    from ``['video']['currentClip']`` of that object.

    Returns:
        * a ``url_transparent`` result delegating to the ThePlatform
          extractor when the clip lists no usable assets but still
          carries a ``mediaId``;
        * otherwise an info dict with formats, subtitles, chapters and
          metadata built from the clip data and its ``medianet`` assets.

    Raises whatever the download/JSON helpers raise, plus ``KeyError`` if
    the initial state lacks ``video``/``currentClip`` or an asset JSON
    lacks ``url``.
    """
    video_id = self._match_id(url)
    # Fetch the page unconditionally: the clip data is read on every path,
    # including for plain numeric ids that contain no '.'
    webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
    data = self._search_json(
        r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip']
    # Keep only assets that have a URL-like 'key' and a truthy 'type'
    assets = traverse_obj(
        data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type']))

    if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))):
        # XXX: Deprecated; CBC is migrating off of ThePlatform
        return {
            '_type': 'url_transparent',
            'ie_key': 'ThePlatform',
            'url': smuggle_url(
                f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', {
                    'force_smil_url': True,
                }),
            'id': media_id,
            '_format_sort_fields': ('res', 'proto'),  # Prioritize direct http formats over HLS
        }

    is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live'
    formats, subtitles = [], {}

    # Direct text tracks are collected first so they take precedence over
    # subtitles discovered inside the HLS manifests (see below)
    for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))):
        subtitles.setdefault(sub.get('language') or 'und', []).append({
            'url': sub['src'],
            'name': sub.get('label'),
        })

    for asset in assets:
        asset_key = asset['key']
        asset_type = asset['type']
        if asset_type != 'medianet':
            self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}')
            continue
        # The asset 'key' is itself the URL of a JSON document describing the media
        asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON')
        ext = mimetype2ext(self._parse_param(asset_data, 'contentType'))
        if ext == 'm3u8':
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live)
            formats.extend(fmts)
            # Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available
            if not subtitles:
                self._merge_subtitles(subs, target=subtitles)
            if is_live or not fmts:
                continue
            # Check for direct https mp4 format
            # Pick the video-bearing HLS format with the highest 'tbr'; falls back to {}
            # NOTE(review): formats without a 'tbr' key presumably get dropped because
            # traverse_obj discards items whose filter raises -- confirm
            best_video_fmt = traverse_obj(fmts, (
                lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all,
                {functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {}
            base_url = self._search_regex(
                r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None)
            if not base_url or '/live/' in base_url:
                continue
            mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4')
            # Only advertise the guessed mp4 URL if a HEAD request to it succeeds
            if self._request_webpage(
                    HEADRequest(mp4_url), video_id, 'Checking for https format',
                    errnote=False, fatal=False):
                formats.append({
                    **best_video_fmt,
                    'url': mp4_url,
                    'format_id': 'https-mp4',
                    'protocol': 'https',
                    'manifest_url': None,
                    'acodec': None,
                })
        else:
            formats.append({
                'url': asset_data['url'],
                'ext': ext,
                'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None,
            })

    # NOTE(review): `float(...) is not None` is always true when float() succeeds, so
    # this filter presumably exists to drop chapters whose 'startTime' is missing or
    # non-numeric (relying on traverse_obj discarding items whose filter raises)
    chapters = traverse_obj(data, (
        'media', 'chapters', lambda _, v: float(v['startTime']) is not None, {
            'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}),
            'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}),
            'title': ('name', {str}),
        }))
    # Filter out pointless single chapters with start_time==0 and no end_time
    if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')):
        chapters = []

    return {
        **traverse_obj(data, {
            'title': ('title', {str}),
            'description': ('description', {str.strip}),
            # Strip the query string from the image URL
            'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}),
            'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}),
            'media_type': ('media', 'clipType', {str}),
            'series': ('showName', {str}),
            'season_number': ('media', 'season', {int_or_none}),
            # No duration is reported for live streams
            'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}),
            'location': ('media', 'region', {str}),
            'tags': ('tags', ..., 'name', {str}),
            'genres': ('media', 'genre', all),
            'categories': ('categories', ..., 'name', {str}),
        }),
        'id': video_id,
        'formats': formats,
        'subtitles': subtitles,
        'chapters': chapters,
        'is_live': is_live,
    }
class CBCPlayerPlaylistIE ( InfoExtractor ) :
IE_NAME = ' cbc.ca:player:playlist '