@ -7,12 +7,13 @@ import base64
from . common import InfoExtractor
from . common import InfoExtractor
from . . utils import (
from . . utils import (
determine_ext ,
float_or_none ,
float_or_none ,
int_or_none ,
int_or_none ,
js_to_json ,
js_to_json ,
parse_iso8601 ,
mimetype2ext ,
mimetype2ext ,
determine_ext ,
parse_iso8601 ,
remove_start ,
)
)
@ -98,43 +99,6 @@ class NYTimesBaseIE(InfoExtractor):
' thumbnails ' : thumbnails ,
' thumbnails ' : thumbnails ,
}
}
def _extract_podcast_from_json ( self , json , page_id , webpage ) :
audio_data = self . _parse_json ( json , page_id , transform_source = js_to_json ) [ ' data ' ]
description = audio_data [ ' track ' ] . get ( ' description ' )
if not description :
description = self . _html_search_meta ( [ ' og:description ' , ' twitter:description ' ] , webpage )
episode_title = audio_data [ ' track ' ] [ ' title ' ]
episode_number = None
episode = audio_data [ ' podcast ' ] [ ' episode ' ] . split ( )
if episode :
episode_number = int_or_none ( episode [ - 1 ] )
video_id = episode [ - 1 ]
else :
video_id = page_id
podcast_title = audio_data [ ' podcast ' ] [ ' title ' ]
title = None
if podcast_title :
title = " %s : %s " % ( podcast_title , episode_title )
else :
title = episode_title
info_dict = {
' id ' : video_id ,
' title ' : title ,
' creator ' : audio_data [ ' track ' ] . get ( ' credit ' ) ,
' series ' : podcast_title ,
' episode ' : episode_title ,
' episode_number ' : episode_number ,
' url ' : audio_data [ ' track ' ] [ ' source ' ] ,
' duration ' : audio_data [ ' track ' ] . get ( ' duration ' ) ,
' description ' : description ,
}
return info_dict
class NYTimesIE ( NYTimesBaseIE ) :
class NYTimesIE ( NYTimesBaseIE ) :
_VALID_URL = r ' https?://(?:(?:www \ .)?nytimes \ .com/video/(?:[^/]+/)+?|graphics8 \ .nytimes \ .com/bcvideo/ \ d+(?: \ . \ d+)?/iframe/embed \ .html \ ?videoId=)(?P<id> \ d+) '
_VALID_URL = r ' https?://(?:(?:www \ .)?nytimes \ .com/video/(?:[^/]+/)+?|graphics8 \ .nytimes \ .com/bcvideo/ \ d+(?: \ . \ d+)?/iframe/embed \ .html \ ?videoId=)(?P<id> \ d+) '
@ -181,33 +145,79 @@ class NYTimesArticleIE(NYTimesBaseIE):
' url ' : ' http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html ' ,
' url ' : ' http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html ' ,
' md5 ' : ' e0d52040cafb07662acf3c9132db3575 ' ,
' md5 ' : ' e0d52040cafb07662acf3c9132db3575 ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' 20 ' ,
' id ' : ' 10000000470906 2' ,
' title ' : " The Run-Up: \u2018 He Was Like an Octopus \u2019 " ,
' title ' : ' The Run-Up: ‘ He Was Like an Octopus’ ' ,
' ext ' : ' mp3 ' ,
' ext ' : ' mp3 ' ,
' description ' : ' We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’ s campaign. ' ,
' description ' : ' md5:fb5c6b93b12efc51649b4847fe066ee4 ' ,
' series ' : ' The Run-Up ' ,
' episode ' : ' ‘ He Was Like an Octopus’ ' ,
' episode_number ' : 20 ,
' duration ' : 2130 ,
}
}
} , {
} , {
' url ' : ' http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html ' ,
' url ' : ' http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html ' ,
' md5 ' : ' 66fb5471d7ef15da98af176dc1af4cb9 ' ,
' info_dict ' : {
' info_dict ' : {
' id ' : ' inside-the-new-york-times-book-review-the-rise-of-hitler ' ,
' id ' : ' 100000004709479 ' ,
' title ' : " The Rise of Hitler " ,
' title ' : ' The Rise of Hitler ' ,
' ext ' : ' mp3 ' ,
' ext ' : ' mp3 ' ,
' description ' : ' Adam Kirsch discusses Volker Ullrich \' s new biography of Hitler; Billy Collins talks about his latest collection of poems; and iO Tillett Wright on his new memoir, " Darling Days. " ' ,
' description ' : ' md5:bce877fd9e3444990cb141875fab0028 ' ,
}
' creator ' : ' Pamela Paul ' ,
' duration ' : 3475 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} , {
} , {
' url ' : ' http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1 ' ,
' url ' : ' http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1 ' ,
' only_matching ' : True ,
' only_matching ' : True ,
} ]
} ]
def _extract_podcast_from_json ( self , json , page_id , webpage ) :
podcast_audio = self . _parse_json (
json , page_id , transform_source = js_to_json )
audio_data = podcast_audio [ ' data ' ]
track = audio_data [ ' track ' ]
episode_title = track [ ' title ' ]
video_url = track [ ' source ' ]
description = track . get ( ' description ' ) or self . _html_search_meta (
[ ' og:description ' , ' twitter:description ' ] , webpage )
podcast_title = audio_data . get ( ' podcast ' , { } ) . get ( ' title ' )
title = ( ' %s : %s ' % ( podcast_title , episode_title )
if podcast_title else episode_title )
episode = audio_data . get ( ' podcast ' , { } ) . get ( ' episode ' ) or ' '
episode_number = int_or_none ( self . _search_regex (
r ' [Ee]pisode \ s+( \ d+) ' , episode , ' epidode number ' , default = None ) )
return {
' id ' : remove_start ( podcast_audio . get ( ' target ' ) , ' FT ' ) or page_id ,
' url ' : video_url ,
' title ' : title ,
' description ' : description ,
' creator ' : track . get ( ' credit ' ) ,
' series ' : podcast_title ,
' episode ' : episode_title ,
' episode_number ' : episode_number ,
' duration ' : int_or_none ( track . get ( ' duration ' ) ) ,
}
def _real_extract ( self , url ) :
def _real_extract ( self , url ) :
page_id = self . _match_id ( url )
page_id = self . _match_id ( url )
webpage = self . _download_webpage ( url , page_id )
webpage = self . _download_webpage ( url , page_id )
video_id = self . _html_search_regex ( r ' data-videoid= " ( \ d+) " ' , webpage , ' video id ' , None , False )
video_id = self . _search_regex (
r ' data-videoid=[ " \' ]( \ d+) ' , webpage , ' video id ' ,
default = None , fatal = False )
if video_id is not None :
if video_id is not None :
return self . _extract_video_from_id ( video_id )
return self . _extract_video_from_id ( video_id )
data_json = self . _html_search_regex ( r ' NYTD \ .FlexTypes \ .push \ (( { .*}) \ ); ' , webpage , ' json data ' )
podcast_data = self . _search_regex (
return self . _extract_podcast_from_json ( data_json , page_id , webpage )
( r ' NYTD \ .FlexTypes \ .push \ s* \ ( \ s*( { .+?}) \ s* \ ) \ s*; \ s*</script ' ,
r ' NYTD \ .FlexTypes \ .push \ s* \ ( \ s*( { .+}) \ s* \ ) \ s*; ' ) ,
webpage , ' podcast data ' )
return self . _extract_podcast_from_json ( podcast_data , page_id , webpage )