[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. However, not every Globe video is hosted on
Brightcove, so fall back to the Generic extractor, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.
pull/12502/head
John Hawkinson 8 years ago committed by Yen Chi Hsuan
parent 772b5ff57f
commit 46b18f2349

@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
)
class BostonGlobeIE(InfoExtractor):
    # Extractor for bostonglobe.com article pages. <video> tags carrying
    # Brightcove data attributes are turned into Brightcove player URLs and
    # handed to 'BrightcoveNew'; pages without such tags are handed to
    # 'Generic'.
    _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
    _TESTS = [
        {
            'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
            'md5': '0a62181079c85c2d2b618c9a738aedaf',
            'info_dict': {
                'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
                'id': '5320421710001',
                'ext': 'mp4',
                'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
                'timestamp': 1486877593,
                'upload_date': '20170212',
                'uploader_id': '245991542',
            },
        },
        {
            # Embedded youtube video; we hand it off to the Generic extractor.
            'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
            'md5': '582b40327089d5c0c949b3c54b13c24b',
            'info_dict': {
                'title': "Who Is Matt Damon's Favorite Batman?",
                'id': 'ZW1QCnlA6Qc',
                'ext': 'mp4',
                'upload_date': '20170217',
                'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
                'uploader': 'The Late Late Show with James Corden',
                'uploader_id': 'TheLateLateShow',
            },
            'expected_warnings': ['404'],
        },
    ]

    def _real_extract(self, url):
        # Returns a url_result for 'Generic' (no usable <video> tag found),
        # a url_result for 'BrightcoveNew' (exactly one), or a playlist of
        # 'BrightcoveNew' entries (more than one).
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        # The og:title is optional; it is only used as the playlist title.
        page_title = self._og_search_title(webpage, default=None)

        # Example of the markup being matched:
        # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
        brightcove_urls = []
        for tag in re.findall(r'(?i)(<video[^>]+>)', webpage):
            tag_attrs = extract_attributes(tag)
            # Ordered to line up with the placeholders in the URL template.
            url_parts = (
                tag_attrs.get('data-account'),
                tag_attrs.get('data-player'),
                tag_attrs.get('data-embed'),
                tag_attrs.get('data-brightcove-video-id'),
            )
            # Skip tags where any of the four attributes is missing or empty.
            if not all(url_parts):
                continue
            brightcove_urls.append(
                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
                % url_parts)

        if not brightcove_urls:
            # No Brightcove markup on the page: delegate the original URL.
            return self.url_result(url, 'Generic')
        if len(brightcove_urls) > 1:
            return self.playlist_from_matches(brightcove_urls, page_id, page_title, ie='BrightcoveNew')
        return self.url_result(brightcove_urls[0], 'BrightcoveNew')

@ -36,34 +36,35 @@ from ..utils import (
clean_html, clean_html,
compiled_regex_type, compiled_regex_type,
determine_ext, determine_ext,
determine_protocol,
error_to_compat_str, error_to_compat_str,
ExtractorError, ExtractorError,
extract_attributes,
fix_xml_ampersands, fix_xml_ampersands,
float_or_none, float_or_none,
GeoRestrictedError, GeoRestrictedError,
GeoUtils, GeoUtils,
int_or_none, int_or_none,
js_to_json, js_to_json,
mimetype2ext,
orderedSet,
parse_codecs,
parse_duration,
parse_iso8601, parse_iso8601,
parse_m3u8_attributes,
RegexNotFoundError, RegexNotFoundError,
sanitize_filename,
sanitized_Request, sanitized_Request,
sanitize_filename,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
unified_timestamp, unified_timestamp,
update_Request,
update_url_query,
urljoin,
url_basename, url_basename,
xpath_element, xpath_element,
xpath_text, xpath_text,
xpath_with_ns, xpath_with_ns,
determine_protocol,
parse_duration,
mimetype2ext,
update_Request,
update_url_query,
parse_m3u8_attributes,
extract_attributes,
parse_codecs,
urljoin,
) )
@ -714,6 +715,13 @@ class InfoExtractor(object):
video_info['title'] = video_title video_info['title'] = video_title
return video_info return video_info
def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
    """Build a playlist result from an iterable of matches.

    Each match is passed through ``getter`` when one is supplied (otherwise
    it is used as-is), then through ``self._proto_relative_url`` and wrapped
    with ``self.url_result`` for the extractor key ``ie``. The wrapped
    results are fed through ``orderedSet`` (presumably an order-preserving
    de-duplication; confirm against utils) and returned via
    ``self.playlist_result`` with ``video_id`` as the playlist id and
    ``video_title`` as the playlist title.
    """
    def as_url_result(match):
        # Only call the getter when the caller actually provided one.
        target = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(target), ie)

    playlist_entries = orderedSet(as_url_result(match) for match in matches)
    return self.playlist_result(
        playlist_entries, playlist_id=video_id, playlist_title=video_title)
@staticmethod @staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
"""Returns a playlist""" """Returns a playlist"""

@ -117,6 +117,7 @@ from .bleacherreport import (
from .blinkx import BlinkxIE from .blinkx import BlinkxIE
from .bloomberg import BloombergIE from .bloomberg import BloombergIE
from .bokecc import BokeCCIE from .bokecc import BokeCCIE
from .bostonglobe import BostonGlobeIE
from .bpb import BpbIE from .bpb import BpbIE
from .br import BRIE from .br import BRIE
from .bravotv import BravoTVIE from .bravotv import BravoTVIE

@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor):
video_description = self._og_search_description(webpage, default=None) video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None)
# Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for Brightcove Legacy Studio embeds # Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls: if bc_urls:
@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor):
# Look for Brightcove New Studio embeds # Look for Brightcove New Studio embeds
bc_urls = BrightcoveNewIE._extract_urls(webpage) bc_urls = BrightcoveNewIE._extract_urls(webpage)
if bc_urls: if bc_urls:
return _playlist_from_matches(bc_urls, ie='BrightcoveNew') return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
# Look for ThePlatform embeds # Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage) tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls: if tp_urls:
return _playlist_from_matches(tp_urls, ie='ThePlatform') return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
# Look for Vessel embeds # Look for Vessel embeds
vessel_urls = VesselIE._extract_urls(webpage) vessel_urls = VesselIE._extract_urls(webpage)
if vessel_urls: if vessel_urls:
return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
# Look for embedded rtl.nl player # Look for embedded rtl.nl player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage) webpage)
if matches: if matches:
return _playlist_from_matches(matches, ie='RtlNl') return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vimeo_urls = VimeoIE._extract_urls(url, webpage) vimeo_urls = VimeoIE._extract_urls(url, webpage)
if vimeo_urls: if vimeo_urls:
return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
vid_me_embed_url = self._search_regex( vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor):
(?:embed|v|p)/.+?) (?:embed|v|p)/.+?)
\1''', webpage) \1''', webpage)
if matches: if matches:
return _playlist_from_matches( return self.playlist_from_matches(
matches, lambda m: unescapeHTML(m[1])) matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
# Look for lazyYT YouTube embed # Look for lazyYT YouTube embed
matches = re.findall( matches = re.findall(
r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
if matches: if matches:
return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
# Look for Wordpress "YouTube Video Importer" plugin # Look for Wordpress "YouTube Video Importer" plugin
matches = re.findall(r'''(?x)<div[^>]+ matches = re.findall(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
if matches: if matches:
return _playlist_from_matches(matches, lambda m: m[-1]) return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
matches = DailymotionIE._extract_urls(webpage) matches = DailymotionIE._extract_urls(webpage)
if matches: if matches:
return _playlist_from_matches(matches) return self.playlist_from_matches(matches, video_id, video_title)
# Look for embedded Dailymotion playlist player (#3822) # Look for embedded Dailymotion playlist player (#3822)
m = re.search( m = re.search(
@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor):
playlists = re.findall( playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists: if playlists:
return _playlist_from_matches( return self.playlist_from_matches(
playlists, lambda p: '//dailymotion.com/playlist/%s' % p) playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player # Look for embedded Wistia player
match = re.search( match = re.search(
@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor):
if mobj is not None: if mobj is not None:
embeds = self._parse_json(mobj.group(1), video_id, fatal=False) embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
if embeds: if embeds:
return _playlist_from_matches( return self.playlist_from_matches(
embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') embeds, video_id, video_title,
getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
# Look for Aparat videos # Look for Aparat videos
mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor):
# Look for funnyordie embed # Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches: if matches:
return _playlist_from_matches( return self.playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie') matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for BBC iPlayer embed # Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches: if matches:
return _playlist_from_matches(matches, ie='BBCCoUk') return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
# Look for embedded RUTV player # Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage) rutv_url = RUTVIE._extract_url(webpage)
@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor):
# Look for embedded SportBox player # Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
if sportbox_urls: if sportbox_urls:
return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
# Look for embedded XHamster player # Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
if xhamster_urls: if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
# Look for embedded TNAFlixNetwork player # Look for embedded TNAFlixNetwork player
tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
if tnaflix_urls: if tnaflix_urls:
return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
# Look for embedded PornHub player # Look for embedded PornHub player
pornhub_urls = PornHubIE._extract_urls(webpage) pornhub_urls = PornHubIE._extract_urls(webpage)
if pornhub_urls: if pornhub_urls:
return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
# Look for embedded DrTuber player # Look for embedded DrTuber player
drtuber_urls = DrTuberIE._extract_urls(webpage) drtuber_urls = DrTuberIE._extract_urls(webpage)
if drtuber_urls: if drtuber_urls:
return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
# Look for embedded RedTube player # Look for embedded RedTube player
redtube_urls = RedTubeIE._extract_urls(webpage) redtube_urls = RedTubeIE._extract_urls(webpage)
if redtube_urls: if redtube_urls:
return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
# Look for embedded Tvigle player # Look for embedded Tvigle player
mobj = re.search( mobj = re.search(
@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor):
# Look for embedded soundcloud player # Look for embedded soundcloud player
soundcloud_urls = SoundcloudIE._extract_urls(webpage) soundcloud_urls = SoundcloudIE._extract_urls(webpage)
if soundcloud_urls: if soundcloud_urls:
return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
# Look for tunein player # Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage) tunein_urls = TuneInBaseIE._extract_urls(webpage)
if tunein_urls: if tunein_urls:
return _playlist_from_matches(tunein_urls) return self.playlist_from_matches(tunein_urls, video_id, video_title)
# Look for embedded mtvservices player # Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor):
# Look for DBTV embeds # Look for DBTV embeds
dbtv_urls = DBTVIE._extract_urls(webpage) dbtv_urls = DBTVIE._extract_urls(webpage)
if dbtv_urls: if dbtv_urls:
return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
# Look for Videa embeds # Look for Videa embeds
videa_urls = VideaIE._extract_urls(webpage) videa_urls = VideaIE._extract_urls(webpage)
if videa_urls: if videa_urls:
return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
# Look for 20 minuten embeds # Look for 20 minuten embeds
twentymin_urls = TwentyMinutenIE._extract_urls(webpage) twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
if twentymin_urls: if twentymin_urls:
return _playlist_from_matches( return self.playlist_from_matches(
twentymin_urls, ie=TwentyMinutenIE.ie_key()) twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
# Look for Openload embeds # Look for Openload embeds
openload_urls = OpenloadIE._extract_urls(webpage) openload_urls = OpenloadIE._extract_urls(webpage)
if openload_urls: if openload_urls:
return _playlist_from_matches( return self.playlist_from_matches(
openload_urls, ie=OpenloadIE.ie_key()) openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
# Look for VideoPress embeds # Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage) videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls: if videopress_urls:
return _playlist_from_matches( return self.playlist_from_matches(
videopress_urls, ie=VideoPressIE.ie_key()) videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
# Look for Rutube embeds # Look for Rutube embeds
rutube_urls = RutubeIE._extract_urls(webpage) rutube_urls = RutubeIE._extract_urls(webpage)
if rutube_urls: if rutube_urls:
return _playlist_from_matches( return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key()) rutube_urls, ie=RutubeIE.ie_key())
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject

Loading…
Cancel
Save