@ -490,6 +490,8 @@ class FileDownloader(object):
updatetime : Use the Last - modified header to set output file timestamps .
writedescription : Write the video description to a . description file
writeinfojson : Write the video description to a . info . json file
writesubtitles : Write the video subtitles to a . srt file
subtitleslang : Language of the subtitles to download
"""
params = None
@ -681,6 +683,10 @@ class FileDownloader(object):
""" Report that the description file is being written """
self . to_screen ( u ' [info] Writing video description to: ' + descfn )
def report_writesubtitles ( self , srtfn ) :
""" Report that the subtitles file is being written.

srtfn is the path of the subtitles file; it is appended to the status
message handed to self.to_screen. """
self . to_screen ( u ' [info] Writing video subtitles to: ' + srtfn )
def report_writeinfojson ( self , infofn ) :
""" Report that the metadata file has been written.

infofn is the path of the JSON metadata file; it is appended to the status
message handed to self.to_screen. """
self . to_screen ( u ' [info] Video description metadata as JSON to: ' + infofn )
@ -808,6 +814,21 @@ class FileDownloader(object):
except ( OSError , IOError ) :
self . trouble ( u ' ERROR: Cannot write description file ' + descfn )
return
if self . params . get ( ' writesubtitles ' , False ) and ' subtitles ' in info_dict and info_dict [ ' subtitles ' ] :
# subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE
try :
srtfn = filename . rsplit ( ' . ' , 1 ) [ 0 ] + u ' .srt '
self . report_writesubtitles ( srtfn )
srtfile = open ( _encodeFilename ( srtfn ) , ' wb ' )
try :
srtfile . write ( info_dict [ ' subtitles ' ] . encode ( ' utf-8 ' ) )
finally :
srtfile . close ( )
except ( OSError , IOError ) :
self . trouble ( u ' ERROR: Cannot write subtitles file ' + descfn )
return
if self . params . get ( ' writeinfojson ' , False ) :
infofn = filename + u ' .info.json '
@ -1206,6 +1227,10 @@ class YoutubeIE(InfoExtractor):
""" Report attempt to download video info webpage. """
self . _downloader . to_screen ( u ' [youtube] %s : Downloading video info webpage ' % video_id )
def report_video_subtitles_download ( self , video_id ) :
""" Report attempt to download video subtitles. """
self . _downloader . to_screen ( u ' [youtube] %s : Downloading video subtitles ' % video_id )
def report_information_extraction ( self , video_id ) :
""" Report attempt to extract video information.

video_id is interpolated into the status line handed to the downloader's
to_screen. """
self . _downloader . to_screen ( u ' [youtube] %s : Extracting video information ' % video_id )
@ -1218,6 +1243,23 @@ class YoutubeIE(InfoExtractor):
""" Indicate the download will use the RTMP protocol. """
self . _downloader . to_screen ( u ' [youtube] RTMP download detected ' )
def _closed_captions_xml_to_srt ( self , xml_string ) :
srt = ' '
texts = re . findall ( r ' <text start= " ([ \ d \ .]+) " ( dur= " ([ \ d \ .]+) " )?>([^<]+)</text> ' , xml_string , re . MULTILINE )
# TODO parse xml instead of regex
for n , ( start , dur_tag , dur , caption ) in enumerate ( texts ) :
if not dur : dur = ' 4 '
start = float ( start )
end = start + float ( dur )
start = " %02i : %02i : %02i , %03i " % ( start / ( 60 * 60 ) , start / 60 % 60 , start % 60 , start % 1 * 1000 )
end = " %02i : %02i : %02i , %03i " % ( end / ( 60 * 60 ) , end / 60 % 60 , end % 60 , end % 1 * 1000 )
caption = re . sub ( ur ' (?u)&(.+?); ' , htmlentity_transform , caption )
caption = re . sub ( ur ' (?u)&(.+?); ' , htmlentity_transform , caption ) # double cycle, inentional
srt + = str ( n ) + ' \n '
srt + = start + ' --> ' + end + ' \n '
srt + = caption + ' \n \n '
return srt
def _print_formats ( self , formats ) :
print ' Available formats: '
for x in formats :
@ -1389,6 +1431,37 @@ class YoutubeIE(InfoExtractor):
vwebpage_doc = lxml . etree . parse ( StringIO . StringIO ( video_webpage ) , html_parser )
video_description = u ' ' . join ( vwebpage_doc . xpath ( ' id( " eow-description " )//text() ' ) )
# TODO use another parser
# closed captions
# video_subtitles stays None unless the final conversion below succeeds; every
# failure on the way is reported through self._downloader.trouble() with a WARNING text.
video_subtitles = None
if self . _downloader . params . get ( ' writesubtitles ' , False ) :
self . report_video_subtitles_download ( video_id )
# first request (type=list): its response is scanned for lang_code values below
request = urllib2 . Request ( ' http://video.google.com/timedtext?hl=en&type=list&v= %s ' % video_id )
try :
srt_list = urllib2 . urlopen ( request ) . read ( )
except ( urllib2 . URLError , httplib . HTTPException , socket . error ) , err :
self . _downloader . trouble ( u ' WARNING: unable to download video subtitles: %s ' % str ( err ) )
else :
srt_lang_list = re . findall ( r ' lang_code= " ([ \ w \ -]+) " ' , srt_list )
if srt_lang_list :
# language choice: the 'subtitleslang' param if set, else 'en' when listed, else the first listed code
if self . _downloader . params . get ( ' subtitleslang ' , False ) :
srt_lang = self . _downloader . params . get ( ' subtitleslang ' )
elif ' en ' in srt_lang_list :
srt_lang = ' en '
else :
srt_lang = srt_lang_list [ 0 ]
# can only be true for a user-specified language: the two fallbacks above always pick a listed code
if not srt_lang in srt_lang_list :
self . _downloader . trouble ( u ' WARNING: no closed captions found in the specified language ' )
else :
# second request: fetch the captions for the chosen language
request = urllib2 . Request ( ' http://video.google.com/timedtext?hl=en&lang= %s &v= %s ' % ( srt_lang , video_id ) )
try :
srt_xml = urllib2 . urlopen ( request ) . read ( )
except ( urllib2 . URLError , httplib . HTTPException , socket . error ) , err :
self . _downloader . trouble ( u ' WARNING: unable to download video subtitles: %s ' % str ( err ) )
else :
# decode the response as UTF-8 before converting it to SRT text
video_subtitles = self . _closed_captions_xml_to_srt ( srt_xml . decode ( ' utf-8 ' ) )
# NOTE(review): indentation is lost in this view; this else presumably pairs with `if srt_lang_list` - confirm
else :
self . _downloader . trouble ( u ' WARNING: video has no closed captions ' )
# token
video_token = urllib . unquote_plus ( video_info [ ' token ' ] [ 0 ] )
@ -1461,6 +1534,7 @@ class YoutubeIE(InfoExtractor):
' thumbnail ' : video_thumbnail . decode ( ' utf-8 ' ) ,
' description ' : video_description ,
' player_url ' : player_url ,
' subtitles ' : video_subtitles
} )
except UnavailableVideoError , err :
self . _downloader . trouble ( u ' \n ERROR: unable to download video ' )
@ -4319,6 +4393,12 @@ def parseOpts():
action = ' store ' , dest = ' format_limit ' , metavar = ' FORMAT ' , help = ' highest quality format to download ' )
video_format . add_option ( ' -F ' , ' --list-formats ' ,
action = ' store_true ' , dest = ' listformats ' , help = ' list all available formats (currently youtube only) ' )
# Closed-captions options: --write-srt is a boolean flag (default False) stored under
# dest 'writesubtitles'; --srt-lang stores its LANG argument under dest 'subtitleslang'.
video_format . add_option ( ' --write-srt ' ,
action = ' store_true ' , dest = ' writesubtitles ' ,
help = ' write video closed captions to a .srt file (currently youtube only) ' , default = False )
video_format . add_option ( ' --srt-lang ' ,
action = ' store ' , dest = ' subtitleslang ' , metavar = ' LANG ' ,
help = ' language of the closed captions to download (optional) use IETF language tags like \' en \' ' )
verbosity . add_option ( ' -q ' , ' --quiet ' ,
@ -4583,6 +4663,8 @@ def _real_main():
' updatetime ' : opts . updatetime ,
' writedescription ' : opts . writedescription ,
' writeinfojson ' : opts . writeinfojson ,
' writesubtitles ' : opts . writesubtitles ,
' subtitleslang ' : opts . subtitleslang ,
' matchtitle ' : opts . matchtitle ,
' rejecttitle ' : opts . rejecttitle ,
' max_downloads ' : opts . max_downloads ,