[abcnews] fix extraction (closes #12394) (closes #27920)

pull/27983/head
Remita Amine 4 years ago
parent d18f4419a7
commit 11b68df7a4

@ -1,14 +1,15 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import calendar
import re import re
import time
from .amp import AMPIE from .amp import AMPIE
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from ..utils import (
from ..compat import compat_urlparse parse_duration,
parse_iso8601,
try_get,
)
class AbcNewsVideoIE(AMPIE): class AbcNewsVideoIE(AMPIE):
@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE):
(?: (?:
abcnews\.go\.com/ abcnews\.go\.com/
(?: (?:
[^/]+/video/(?P<display_id>[0-9a-z-]+)-| (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-|
video/embed\?.*?\bid= video/(?:embed|itemfeed)\?.*?\bid=
)| )|
fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/
) )
@ -49,6 +50,12 @@ class AbcNewsVideoIE(AMPIE):
}, { }, {
'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
'only_matching': True, 'only_matching': True,
}, {
'url': 'http://abcnews.go.com/video/itemfeed?id=46979033',
'only_matching': True,
}, {
'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -69,28 +76,23 @@ class AbcNewsIE(InfoExtractor):
_VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', # Youtube Embeds
'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501',
'info_dict': { 'info_dict': {
'id': '10505354', 'id': '51286501',
'ext': 'flv', 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player",
'display_id': 'dramatic-video-rare-death-job-america', 'description': 'Billingsley went from a child actor to Hollywood power player.',
'title': 'Occupational Hazards',
'description': 'Nightline investigates the dangers that lurk at various jobs.',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20100428',
'timestamp': 1272412800,
}, },
'add_ie': ['AbcNewsVideo'], 'playlist_count': 5,
}, { }, {
'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
'info_dict': { 'info_dict': {
'id': '38897857', 'id': '38897857',
'ext': 'mp4', 'ext': 'mp4',
'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
'title': 'Justin Timberlake Drops Hints For Secret Single', 'title': 'Justin Timberlake Drops Hints For Secret Single',
'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
'upload_date': '20160515', 'upload_date': '20160505',
'timestamp': 1463329500, 'timestamp': 1462442280,
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -102,49 +104,55 @@ class AbcNewsIE(InfoExtractor):
}, { }, {
'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
'only_matching': True, 'only_matching': True,
}, {
# inline.type == 'video'
'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) story_id = self._match_id(url)
display_id = mobj.group('display_id') webpage = self._download_webpage(url, story_id)
video_id = mobj.group('id') story = self._parse_json(self._search_regex(
r"window\['__abcnews__'\]\s*=\s*({.+?});",
webpage = self._download_webpage(url, video_id) webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0]
video_url = self._search_regex( article_contents = story.get('articleContents') or {}
r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
full_video_url = compat_urlparse.urljoin(url, video_url) def entries():
featured_video = story.get('featuredVideo') or {}
youtube_url = YoutubeIE._extract_url(webpage) feed = try_get(featured_video, lambda x: x['video']['feed'])
if feed:
timestamp = None yield {
date_str = self._html_search_regex( '_type': 'url',
r'<span[^>]+class="timestamp">([^<]+)</span>', 'id': featured_video.get('id'),
webpage, 'timestamp', fatal=False) 'title': featured_video.get('name'),
if date_str: 'url': feed,
tz_offset = 0 'thumbnail': featured_video.get('images'),
if date_str.endswith(' ET'): # Eastern Time 'description': featured_video.get('description'),
tz_offset = -5 'timestamp': parse_iso8601(featured_video.get('uploadDate')),
date_str = date_str[:-3] 'duration': parse_duration(featured_video.get('duration')),
date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] 'ie_key': AbcNewsVideoIE.ie_key(),
for date_format in date_formats: }
try:
timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) for inline in (article_contents.get('inlines') or []):
except ValueError: inline_type = inline.get('type')
continue if inline_type == 'iframe':
if timestamp is not None: iframe_url = try_get(inline, lambda x: x['attrs']['src'])
timestamp -= tz_offset * 3600 if iframe_url:
yield self.url_result(iframe_url)
entry = { elif inline_type == 'video':
'_type': 'url_transparent', video_id = inline.get('id')
'ie_key': AbcNewsVideoIE.ie_key(), if video_id:
'url': full_video_url, yield {
'id': video_id, '_type': 'url',
'display_id': display_id, 'id': video_id,
'timestamp': timestamp, 'url': 'http://abcnews.go.com/video/embed?id=' + video_id,
} 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'),
'description': inline.get('description'),
if youtube_url: 'duration': parse_duration(inline.get('duration')),
entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] 'ie_key': AbcNewsVideoIE.ie_key(),
return self.playlist_result(entries) }
return entry return self.playlist_result(
entries(), story_id, article_contents.get('headline'),
article_contents.get('subHead'))

Loading…
Cancel
Save