From 135dfa2c7ebc9284db940713c0dc6cbc19ca5fa4 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 20 Jan 2022 03:25:15 +0530 Subject: [PATCH] [extractor,cleanup] Use `_search_nextjs_data` --- yt_dlp/extractor/common.py | 6 +++--- yt_dlp/extractor/itv.py | 4 ++-- yt_dlp/extractor/nbc.py | 4 +--- yt_dlp/extractor/novaplay.py | 4 +--- yt_dlp/extractor/skyit.py | 5 +---- yt_dlp/extractor/stv.py | 5 +---- yt_dlp/extractor/telemundo.py | 3 +-- yt_dlp/extractor/tiktok.py | 7 ++----- 8 files changed, 12 insertions(+), 26 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index a23840e41..1436724dd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1544,12 +1544,12 @@ class InfoExtractor(object): return dict((k, v) for k, v in info.items() if v is not None) - def _search_nextjs_data(self, webpage, video_id, **kw): + def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): return self._parse_json( self._search_regex( r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', **kw), - video_id, **kw) + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index bdd6af688..f1591403f 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -243,8 +243,8 @@ class ITVBTCCIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - json_map = try_get(self._parse_json(self._html_search_regex( - '(?s)]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)', webpage, 'json_map'), playlist_id), + json_map = try_get( + self._search_nextjs_data(webpage, playlist_id), lambda x: x['props']['pageProps']['article']['body']['content']) or [] entries = [] diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index bcd388357..109403440 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -408,9 +408,7 @@ class NBCNewsIE(ThePlatformIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r']+id="__NEXT_DATA__"[^>]*>({.+?})', - webpage, 'bootstrap json'), video_id)['props']['initialState'] + data = self._search_nextjs_data(webpage, video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index 724986a06..bfb2c8751 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -41,9 +41,7 @@ class NovaPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_props = self._parse_json(self._search_regex( - r'({.+})', - webpage, 'video_props'), video_id)['props']['pageProps']['video'] + video_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] m3u8_url = self._download_json( f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams', video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url'] diff --git a/yt_dlp/extractor/skyit.py b/yt_dlp/extractor/skyit.py index 496bb42a2..ddb43c075 100644 --- a/yt_dlp/extractor/skyit.py +++ b/yt_dlp/extractor/skyit.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_parse_qs, compat_urllib_parse_urlparse, ) @@ -125,9 +124,7 @@ class SkyItVideoLiveIE(SkyItPlayerIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - asset_id = compat_str(self._parse_json(self._search_regex( - r']+id="__NEXT_DATA__"[^>]*>({.+?})', - webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + asset_id = str(self._search_nextjs_data(webpage, display_id)['props']['initialState']['livePage']['content']['asset_id']) livestream = self._download_json( 'https://apid.sky.it/vdp/v1/getLivestream', asset_id, query={'id': asset_id}) diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index d36a4b6e9..ba5661d74 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -45,10 +45,7 @@ class STVPlayerIE(InfoExtractor): ptype, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = (self._parse_json(self._search_regex( - r']+id="__NEXT_DATA__"[^>]*>({.+?})', - webpage, 'next data', default='{}'), video_id, - fatal=False) or {}).get('props') or {} + props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py index e326bbdd5..ebcecf55f 100644 --- a/yt_dlp/extractor/telemundo.py +++ b/yt_dlp/extractor/telemundo.py @@ -34,8 +34,7 @@ class TelemundoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - metadata = self._parse_json( - self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id) + metadata = self._search_nextjs_data(webpage, video_id) redirect_url = try_get( metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl']) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 6dffdf05e..172fc9bb8 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -451,12 +451,9 @@ class TikTokIE(TikTokBaseIE): # If we only call once, we get a 403 when downlaoding the video. self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id, note='Downloading video webpage') - next_json = self._search_regex( - r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P[^<]+)', - webpage, 'next data', group='next_data', default=None) + next_data = self._search_nextjs_data(webpage, video_id, default='{}') - if next_json: - next_data = self._parse_json(next_json, video_id) + if next_data: status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict) else: