@ -1,8 +1,10 @@
import re
import xml . etree . ElementTree
import json
from . common import InfoExtractor
from . . utils import (
compat_urlparse ,
determine_ext ,
)
@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor):
u " playlist " : [
{
u " file " : u " manofsteel-trailer4.mov " ,
u " md5 " : u " 11874af099d480cc09e103b189805d5f " ,
u " md5 " : u " d97a8e575432dbcb81b7c3acb741f8a8 " ,
u " info_dict " : {
u " duration " : 111 ,
u " thumbnail " : u " http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg " ,
u " title " : u " Trailer 4 " ,
u " upload_date " : u " 20130523 " ,
u " uploader_id " : u " wb " ,
@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor):
} ,
{
u " file " : u " manofsteel-trailer3.mov " ,
u " md5 " : u " 07a0a262aae5afe68120eed61137ab34 " ,
u " md5 " : u " b8017b7131b721fb4e8d6f49e1df908c " ,
u " info_dict " : {
u " duration " : 182 ,
u " thumbnail " : u " http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg " ,
u " title " : u " Trailer 3 " ,
u " upload_date " : u " 20130417 " ,
u " uploader_id " : u " wb " ,
@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor):
} ,
{
u " file " : u " manofsteel-trailer.mov " ,
u " md5 " : u " e401fde0813008e3307e54b6f384cff1 " ,
u " md5 " : u " d0f1e1150989b9924679b441f3404d48 " ,
u " info_dict " : {
u " duration " : 148 ,
u " thumbnail " : u " http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg " ,
u " title " : u " Trailer " ,
u " upload_date " : u " 20121212 " ,
u " uploader_id " : u " wb " ,
@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor):
} ,
{
u " file " : u " manofsteel-teaser.mov " ,
u " md5 " : u " 76b392f2ae9e7c98b22913c10a639c97 " ,
u " md5 " : u " 5fe08795b943eb2e757fa95cb6def1cb " ,
u " info_dict " : {
u " duration " : 93 ,
u " thumbnail " : u " http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg " ,
u " title " : u " Teaser " ,
u " upload_date " : u " 20120721 " ,
u " uploader_id " : u " wb " ,
@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor):
]
}
# Regex for an "iTunes.playURL(...);" call; group 1 lazily captures the text
# between the parentheses, stopping at the first ");".
# It is passed to re.sub() and to self._search_regex() in _real_extract.
# NOTE(review): the "." in "iTunes.playURL" is unescaped, so it matches any
# single character rather than only a literal dot.
_JSON_RE = r ' iTunes.playURL \ ((.*?) \ ); '
def _real_extract ( self , url ) :
mobj = re . match ( self . _VALID_URL , url )
movie = mobj . group ( ' movie ' )
uploader_id = mobj . group ( ' company ' )
playlist_url = url. partition ( u ' ? ' ) [ 0 ] + u ' /includes/playlists/web.inc '
playlist_url = compat_urlparse. urljoin ( url , u ' includes/playlists/itunes.inc ' )
playlist_snippet = self . _download_webpage ( playlist_url , movie )
playlist_cleaned = re . sub ( r ' (?s)<script>.*?</script> ' , u ' ' , playlist_snippet )
playlist_cleaned = re . sub ( r ' (?s)<script[^<]*?>.*?</script> ' , u ' ' , playlist_snippet )
playlist_cleaned = re . sub ( r ' <img ([^<]*?)> ' , r ' <img \ 1/> ' , playlist_cleaned )
# The ' characters inside the onClick attributes are not escaped, so the
# snippet cannot be parsed with xml.etree.ElementTree.fromstring as-is,
# e.g. http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m):
    """Rebuild one iTunes.playURL(...) call with its apostrophes escaped.

    m is a regex match object whose group 1 holds the text between the
    parentheses of the call.  Every ' in that text is replaced by the
    character reference &#39; so that the surrounding markup can be fed
    to an XML parser; the parser resolves the reference back to ' when
    the attribute value is read, leaving the payload unchanged.

    Returns the rewritten call as a unicode string.
    """
    return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
playlist_cleaned = re . sub ( self . _JSON_RE , _clean_json , playlist_cleaned )
playlist_html = u ' <html> ' + playlist_cleaned + u ' </html> '
size_cache = { }
doc = xml . etree . ElementTree . fromstring ( playlist_html )
playlist = [ ]
for li in doc . findall ( ' ./div/ul/li ' ) :
title = li . find ( ' .//h3 ' ) . text
on_click = li . find ( ' .//a ' ) . attrib [ ' onClick ' ]
trailer_info_json = self . _search_regex ( self . _JSON_RE ,
on_click , u ' trailer info ' )
trailer_info = json . loads ( trailer_info_json )
title = trailer_info [ ' title ' ]
video_id = movie + ' - ' + re . sub ( r ' [^a-zA-Z0-9] ' , ' ' , title ) . lower ( )
thumbnail = li . find ( ' .//img ' ) . attrib [ ' src ' ]
upload_date = trailer_info [ ' posted ' ] . replace ( ' - ' , ' ' )
date_el = li . find ( ' .//p ' )
upload_date = None
m = re . search ( r ' : \ s?(?P<month>[0-9] {2} )/(?P<day>[0-9] {2} )/(?P<year>[0-9] {2} ) ' , date_el . text )
if m :
upload_date = u ' 20 ' + m . group ( ' year ' ) + m . group ( ' month ' ) + m . group ( ' day ' )
runtime_el = date_el . find ( ' ./br ' )
m = re . search ( r ' : \ s?(?P<minutes>[0-9]+):(?P<seconds>[0-9] { 1,2}) ' , runtime_el . tail )
runtime = trailer_info [ ' runtime ' ]
m = re . search ( r ' (?P<minutes>[0-9]+):(?P<seconds>[0-9] { 1,2}) ' , runtime )
duration = None
if m :
duration = 60 * int ( m . group ( ' minutes ' ) ) + int ( m . group ( ' seconds ' ) )
formats = [ ]
for formats_el in li . findall ( ' .//a ' ) :
if formats_el . attrib [ ' class ' ] != ' OverlayPanel ' :
continue
target = formats_el . attrib [ ' target ' ]
format_code = formats_el . text
if ' Automatic ' in format_code :
continue
first_url = trailer_info [ ' url ' ]
trailer_id = first_url . split ( ' / ' ) [ - 1 ] . rpartition ( ' _ ' ) [ 0 ]
settings_json_url = compat_urlparse . urljoin ( url , ' includes/settings/ %s .json ' % trailer_id )
settings_json = self . _download_webpage ( settings_json_url , trailer_id , u ' Downloading settings json ' )
settings = json . loads ( settings_json )
size_q = formats_el . attrib [ ' href ' ]
size_id = size_q . rpartition ( ' #videos- ' ) [ 2 ]
if size_id not in size_cache :
size_url = url + size_q
sizepage_html = self . _download_webpage (
size_url , movie ,
note = u ' Downloading size info %s ' % size_id ,
errnote = u ' Error while downloading size info %s ' % size_id ,
)
_doc = xml . etree . ElementTree . fromstring ( sizepage_html )
size_cache [ size_id ] = _doc
sizepage_doc = size_cache [ size_id ]
links = sizepage_doc . findall ( ' .// { http://www.w3.org/1999/xhtml}ul/ { http://www.w3.org/1999/xhtml}li/ { http://www.w3.org/1999/xhtml}a ' )
for vid_a in links :
href = vid_a . get ( ' href ' )
if not href . endswith ( target ) :
continue
detail_q = href . partition ( ' # ' ) [ 0 ]
detail_url = url + ' / ' + detail_q
m = re . match ( r ' includes/(?P<detail_id>[^/]+)/ ' , detail_q )
detail_id = m . group ( ' detail_id ' )
detail_html = self . _download_webpage (
detail_url , movie ,
note = u ' Downloading detail %s %s ' % ( detail_id , size_id ) ,
errnote = u ' Error while downloading detail %s %s ' % ( detail_id , size_id )
)
detail_doc = xml . etree . ElementTree . fromstring ( detail_html )
movie_link_el = detail_doc . find ( ' .// { http://www.w3.org/1999/xhtml}a ' )
assert movie_link_el . get ( ' class ' ) == ' movieLink '
movie_link = movie_link_el . get ( ' href ' ) . partition ( ' ? ' ) [ 0 ] . replace ( ' _ ' , ' _h ' )
ext = determine_ext ( movie_link )
assert ext == ' mov '
formats . append ( {
' format ' : format_code ,
' ext ' : ext ,
' url ' : movie_link ,
} )
formats = [ ]
for format in settings [ ' metadata ' ] [ ' sizes ' ] :
# The src is a file pointing to the real video file
format_url = re . sub ( r ' _( \ d*p.mov) ' , r ' _h \ 1 ' , format [ ' src ' ] )
formats . append ( {
' url ' : format_url ,
' ext ' : determine_ext ( format_url ) ,
' format ' : format [ ' type ' ] ,
' width ' : format [ ' width ' ] ,
' height ' : int ( format [ ' height ' ] ) ,
} )
formats = sorted ( formats , key = lambda f : ( f [ ' height ' ] , f [ ' width ' ] ) )
info = {
' _type ' : ' video ' ,