@ -9,10 +9,6 @@ from ..utils import (
lowercase_escape ,
lowercase_escape ,
smuggle_url ,
smuggle_url ,
unescapeHTML ,
unescapeHTML ,
update_url_query ,
int_or_none ,
HEADRequest ,
parse_iso8601 ,
)
)
@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor):
class NBCNewsIE ( ThePlatformIE ) :
class NBCNewsIE ( ThePlatformIE ) :
_VALID_URL = r ''' (?x)https?://(?:www \ .)?(?:nbcnews|today )\ .com/
_VALID_URL = r ''' (?x)https?://(?:www \ .)?(?:nbcnews|today |msnbc )\ .com/
( ? : video / . + ? / ( ? P < id > \d + ) |
( ? : video / . + ? / ( ? P < id > \d + ) |
( [ ^ / ] + / ) * ( ? P < display _id> [ ^ / ? ] + ) )
( [ ^ / ] + / ) * ( ? : . * - ) ? ( ? P < mpx _id> [ ^ / ? ] + ) )
'''
'''
_TESTS = [
_TESTS = [
@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):
' ext ' : ' mp4 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' How Twitter Reacted To The Snowden Interview ' ,
' title ' : ' How Twitter Reacted To The Snowden Interview ' ,
' description ' : ' md5:65a0bd5d76fe114f3c2727aa3a81fe64 ' ,
' description ' : ' md5:65a0bd5d76fe114f3c2727aa3a81fe64 ' ,
' uploader ' : ' NBCU-NEWS ' ,
' timestamp ' : 1401363060 ,
' upload_date ' : ' 20140529 ' ,
} ,
} ,
} ,
} ,
{
{
' url ' : ' http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156 ' ,
' url ' : ' http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156 ' ,
' md5 ' : ' fdbf39ab73a72df5896b6234ff98518a ' ,
' md5 ' : ' fdbf39ab73a72df5896b6234ff98518a ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' Wjf9EDR3A_60 ' ,
' id ' : ' 529953347624 ' ,
' ext ' : ' mp4 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' FULL EPISODE: Family Business ' ,
' title ' : ' FULL EPISODE: Family Business ' ,
' description ' : ' md5:757988edbaae9d7be1d585eb5d55cc04 ' ,
' description ' : ' md5:757988edbaae9d7be1d585eb5d55cc04 ' ,
@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):
' ext ' : ' mp4 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Nightly News with Brian Williams Full Broadcast (February 4) ' ,
' title ' : ' Nightly News with Brian Williams Full Broadcast (February 4) ' ,
' description ' : ' md5:1c10c1eccbe84a26e5debb4381e2d3c5 ' ,
' description ' : ' md5:1c10c1eccbe84a26e5debb4381e2d3c5 ' ,
' timestamp ' : 1423104900 ,
' uploader ' : ' NBCU-NEWS ' ,
' upload_date ' : ' 20150205 ' ,
} ,
} ,
} ,
} ,
{
{
@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):
' info_dict ' : {
' info_dict ' : {
' id ' : ' 529953347624 ' ,
' id ' : ' 529953347624 ' ,
' ext ' : ' mp4 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Volkswagen U.S. Chief: We \' Totally Screwed Up \' ' ,
' title ' : ' Volkswagen U.S. Chief: \xa0 We Have Totally Screwed Up ' ,
' description ' : ' md5:d22d1281a24f22ea0880741bb4dd6301 ' ,
' description ' : ' md5:c8be487b2d80ff0594c005add88d8351 ' ,
' upload_date ' : ' 20150922 ' ,
' timestamp ' : 1442917800 ,
' uploader ' : ' NBCU-NEWS ' ,
} ,
} ,
' expected_warnings ' : [ ' http-6000 is not available ' ]
} ,
} ,
{
{
' url ' : ' http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788 ' ,
' url ' : ' http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788 ' ,
@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE):
' description ' : ' md5:74752b7358afb99939c5f8bb2d1d04b1 ' ,
' description ' : ' md5:74752b7358afb99939c5f8bb2d1d04b1 ' ,
' upload_date ' : ' 20160420 ' ,
' upload_date ' : ' 20160420 ' ,
' timestamp ' : 1461152093 ,
' timestamp ' : 1461152093 ,
' uploader ' : ' NBCU-NEWS ' ,
} ,
} ,
{
' url ' : ' http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924 ' ,
' md5 ' : ' 6d236bf4f3dddc226633ce6e2c3f814d ' ,
' info_dict ' : {
' id ' : ' 314487875924 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' The chaotic GOP immigration vote ' ,
' description ' : ' The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides. ' ,
' thumbnail ' : ' re:^https?://.* \ .jpg$ ' ,
' timestamp ' : 1406937606 ,
' upload_date ' : ' 20140802 ' ,
' uploader ' : ' NBCU-NEWS ' ,
' categories ' : [ ' MSNBC/Topics/Franchise/Best of last night ' , ' MSNBC/Topics/General/Congress ' ] ,
} ,
} ,
} ,
} ,
{
{
@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE):
}
}
else :
else :
# "feature" and "nightly-news" pages use theplatform.com
# "feature" and "nightly-news" pages use theplatform.com
display_id = mobj . group ( ' display_id ' )
video_id = mobj . group ( ' mpx_id ' )
webpage = self . _download_webpage ( url , display_id )
if not video_id . isdigit ( ) :
info = None
webpage = self . _download_webpage ( url , video_id )
bootstrap_json = self . _search_regex (
info = None
[ r ' (?m)(?:var \ s+(?:bootstrapJson|playlistData)|NEWS \ .videoObj) \ s*= \ s*( { .+});? \ s*$ ' ,
bootstrap_json = self . _search_regex (
r ' videoObj \ s*: \ s*( { .+}) ' , r ' data-video= " ([^ " ]+) " ' ] ,
[ r ' (?m)(?:var \ s+(?:bootstrapJson|playlistData)|NEWS \ .videoObj) \ s*= \ s*( { .+});? \ s*$ ' ,
webpage , ' bootstrap json ' , default = None )
r ' videoObj \ s*: \ s*( { .+}) ' , r ' data-video= " ([^ " ]+) " ' ] ,
bootstrap = self . _parse_json (
webpage , ' bootstrap json ' , default = None )
bootstrap_json , display_id , transform_source = unescapeHTML )
bootstrap = self . _parse_json (
if ' results ' in bootstrap :
bootstrap_json , video_id , transform_source = unescapeHTML )
info = bootstrap [ ' results ' ] [ 0 ] [ ' video ' ]
if ' results ' in bootstrap :
elif ' video ' in bootstrap :
info = bootstrap [ ' results ' ] [ 0 ] [ ' video ' ]
info = bootstrap [ ' video ' ]
elif ' video ' in bootstrap :
else :
info = bootstrap [ ' video ' ]
info = bootstrap
video_id = info [ ' mpxId ' ]
title = info [ ' title ' ]
subtitles = { }
caption_links = info . get ( ' captionLinks ' )
if caption_links :
for ( sub_key , sub_ext ) in ( ( ' smpte-tt ' , ' ttml ' ) , ( ' web-vtt ' , ' vtt ' ) , ( ' srt ' , ' srt ' ) ) :
sub_url = caption_links . get ( sub_key )
if sub_url :
subtitles . setdefault ( ' en ' , [ ] ) . append ( {
' url ' : sub_url ,
' ext ' : sub_ext ,
} )
formats = [ ]
for video_asset in info [ ' videoAssets ' ] :
video_url = video_asset . get ( ' publicUrl ' )
if not video_url :
continue
container = video_asset . get ( ' format ' )
asset_type = video_asset . get ( ' assetType ' ) or ' '
if container == ' ISM ' or asset_type == ' FireTV-Once ' :
continue
elif asset_type == ' OnceURL ' :
tp_formats , tp_subtitles = self . _extract_theplatform_smil (
video_url , video_id )
formats . extend ( tp_formats )
subtitles = self . _merge_subtitles ( subtitles , tp_subtitles )
else :
else :
tbr = int_or_none ( video_asset . get ( ' bitRate ' ) or video_asset . get ( ' bitrate ' ) , 1000 )
info = bootstrap
format_id = ' http %s ' % ( ' - %d ' % tbr if tbr else ' ' )
video_id = info [ ' mpxId ' ]
video_url = update_url_query (
video_url , { ' format ' : ' redirect ' } )
# resolve the url so that we can check availability and detect the correct extension
head = self . _request_webpage (
HEADRequest ( video_url ) , video_id ,
' Checking %s url ' % format_id ,
' %s is not available ' % format_id ,
fatal = False )
if head :
video_url = head . geturl ( )
formats . append ( {
' format_id ' : format_id ,
' url ' : video_url ,
' width ' : int_or_none ( video_asset . get ( ' width ' ) ) ,
' height ' : int_or_none ( video_asset . get ( ' height ' ) ) ,
' tbr ' : tbr ,
' container ' : video_asset . get ( ' format ' ) ,
} )
self . _sort_formats ( formats )
return {
return {
' _type ' : ' url_transparent ' ,
' id ' : video_id ,
' id ' : video_id ,
' title ' : title ,
# http://feed.theplatform.com/f/2E2eJC/nbcnews also works
' description ' : info . get ( ' description ' ) ,
' url ' : ' http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId= %s ' % video_id ,
' thumbnail ' : info . get ( ' thumbnail ' ) ,
' ie_key ' : ' ThePlatformFeed ' ,
' duration ' : int_or_none ( info . get ( ' duration ' ) ) ,
' timestamp ' : parse_iso8601 ( info . get ( ' pubDate ' ) or info . get ( ' pub_date ' ) ) ,
' formats ' : formats ,
' subtitles ' : subtitles ,
}
}
class MSNBCIE ( InfoExtractor ) :
# https URLs redirect to corresponding http ones
_VALID_URL = r ' https?://www \ .msnbc \ .com/[^/]+/watch/(?P<id>[^/]+) '
_TEST = {
' url ' : ' http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924 ' ,
' md5 ' : ' 6d236bf4f3dddc226633ce6e2c3f814d ' ,
' info_dict ' : {
' id ' : ' n_hayes_Aimm_140801_272214 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' The chaotic GOP immigration vote ' ,
' description ' : ' The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides. ' ,
' thumbnail ' : ' re:^https?://.* \ .jpg$ ' ,
' timestamp ' : 1406937606 ,
' upload_date ' : ' 20140802 ' ,
' uploader ' : ' NBCU-NEWS ' ,
' categories ' : [ ' MSNBC/Topics/Franchise/Best of last night ' , ' MSNBC/Topics/General/Congress ' ] ,
} ,
}
def _real_extract ( self , url ) :
video_id = self . _match_id ( url )
webpage = self . _download_webpage ( url , video_id )
embed_url = self . _html_search_meta ( ' embedURL ' , webpage )
return self . url_result ( embed_url )