@ -1,4 +1,3 @@
import datetime
import itertools
import json
import re
@ -6,86 +5,85 @@ import re
from . common import InfoExtractor , SearchInfoExtractor
from . . utils import (
compat_urllib_parse ,
ExtractorError ,
compat_urlparse ,
determine_ext ,
clean_html ,
)
class YahooIE(InfoExtractor):
    """Information extractor for video pages on screen.yahoo.com.

    The numeric video id is taken from the page URL. Stream and metadata
    information is read from the ``YVIDEO_INIT_ITEMS`` JSON object embedded
    in the downloaded page.
    """

    IE_DESC = u'Yahoo screen'
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
    _TESTS = [
        {
            u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            u'file': u'214727115.mp4',
            u'info_dict': {
                u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
                u'description': u'Julian and Travis watch Julian Smith',
            },
        },
        {
            u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            u'file': u'103000935.flv',
            u'info_dict': {
                u'title': u'The Cougar Lies with Spanish Moss',
                u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
            u'params': {
                # Requires rtmpdump
                u'skip_download': True,
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dictionary for the video at *url*.

        The result holds ``id``, ``title``, ``description``, ``thumbnail``
        and the list of ``formats``; the fields of the last (largest)
        format are also merged into the top level of the dictionary.

        Raises ExtractorError if *url* does not match ``_VALID_URL`` or if
        the page lists no streams.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The page assigns a JSON object to YVIDEO_INIT_ITEMS; MULTILINE lets
        # "$" anchor at the end of the line that holds the assignment.
        items_json = self._search_regex(
            r'YVIDEO_INIT_ITEMS = ({.*?});$',
            webpage, u'items', flags=re.MULTILINE)
        items = json.loads(items_json)
        media = items['mediaItems']['query']['results']['mediaObj'][0]
        meta = media['meta']

        formats = []
        for stream in media['streams']:
            format_info = {
                'width': stream.get('width'),
                'height': stream.get('height'),
                'bitrate': stream.get('bitrate'),
            }

            host = stream['host']
            path = stream['path']
            if host.startswith('rtmp'):
                # For rtmp streams the host is the url and the path is
                # passed on separately as the play path.
                format_info.update({
                    'url': host,
                    'play_path': path,
                    'ext': 'flv',
                })
            else:
                format_url = compat_urlparse.urljoin(host, path)
                format_info['url'] = format_url
                format_info['ext'] = determine_ext(format_url)

            formats.append(format_info)

        if not formats:
            raise ExtractorError(u'Unable to extract video url')

        # Sort ascending by (height, width) so the last entry is the largest.
        # Missing dimensions are None; treat them as 0 because None cannot be
        # ordered against integers on Python 3.
        formats.sort(key=lambda f: (f['height'] or 0, f['width'] or 0))

        info = {
            'id': video_id,
            'title': meta['title'],
            'formats': formats,
            'description': clean_html(meta['description']),
            'thumbnail': meta['thumbnail'],
        }
        # TODO: Remove when #980 has been merged
        info.update(formats[-1])

        return info
class YahooSearchIE ( SearchInfoExtractor ) :
IE_DESC = u ' Yahoo screen search '