diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index d7c35d763..ea3d5ca41 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -41,11 +41,18 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index d0edc80d6..6c8ddb25f 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -41,11 +41,18 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 33fcc7322..e5079a859 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1056,11 +1056,20 @@ class YoutubeDL(object): def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False): - ''' - Returns a list with a dictionary for each video we find. - If 'download', also downloads the videos. - extra_info is a dict containing the extra values to add to each result - ''' + """ + Return a list with a dictionary for each video extracted. + + Arguments: + url -- URL to extract + + Keyword arguments: + download -- whether to download videos during extraction + ie_key -- extractor key hint + extra_info -- dictionary containing the extra values to add to each result + process -- whether to resolve all unresolved references (URLs, playlist items), + must be True for download to work. + force_generic_extractor -- force using the generic extractor + """ if not ie_key and force_generic_extractor: ie_key = 'Generic' diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -133,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) diff --git a/yt_dlp/extractor/dispeak.py b/yt_dlp/extractor/dispeak.py index b1c02ca2b..be7ad1202 100644 --- a/yt_dlp/extractor/dispeak.py +++ b/yt_dlp/extractor/dispeak.py @@ -32,6 +32,18 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624, empty speakerVideo + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -85,25 +97,19 @@ class DigitallySpeakingIE(InfoExtractor): 'quality': 1, 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'format_id': 'slides', - 'acodec': 'none', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'format_id': 'speaker', - }) + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(video_path, '.flv'), + 'ext': 'flv', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'format_id': format_id, + }) return formats def _real_extract(self, url): diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 79f9c74a3..b835ca72c 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -151,7 +151,6 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) -from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index e57e165fc..ee8a22f9d 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -402,6 +402,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "
]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - r']+id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + r'(?:data-id|[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TOKEN = None @@ -51,6 +51,10 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, + }, { + # with lang code + 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', + 'only_matching': True, }] def _login(self): diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py index a248a170d..acc6478b8 100644 --- a/yt_dlp/extractor/gdcvault.py +++ b/yt_dlp/extractor/gdcvault.py @@ -5,7 +5,10 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( + HEADRequest, + remove_start, sanitized_Request, + smuggle_url, urlencode_postdata, ) @@ -100,6 +103,26 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -120,38 +143,78 @@ class GDCVaultIE(InfoExtractor): request = sanitized_Request(login_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(request, display_id, 'Logging in') - webpage = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') + start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') self._download_webpage(logout_url, display_id, 'Logging out') - return webpage + return start_page def _real_extract(self, url): video_id, name = re.match(self._VALID_URL, url).groups() display_id = name or video_id - webpage = self._download_webpage(url, display_id) - - title = self._html_search_regex( - r'Session Name:?\s*(.*?)', - webpage, 'title') - - PLAYER_REGEX = r'' - manifest_url = self._html_search_regex( - PLAYER_REGEX, webpage, 'manifest_url') - - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', manifest_url, 'partner id', - default='1670711') + webpage_url = 'http://www.gdcvault.com/play/' + video_id + start_page = self._download_webpage(webpage_url, display_id) + + direct_url = self._search_regex( + r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', + start_page, 'url', default=None) + if direct_url: + title = self._html_search_regex( + r'Session Name:?\s*(.*?)', + start_page, 'title') + video_url = 'http://www.gdcvault.com' + direct_url + # resolve the url so that we can detect the correct extension + video_url = self._request_webpage( + HEADRequest(video_url), video_id).geturl() + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + } - kaltura_id = self._search_regex( - r'entry_id=(?P(?:[^&])+)', manifest_url, - 'kaltura id', group='id') + embed_url = KalturaIE._extract_url(start_page) + if embed_url: + embed_url = smuggle_url(embed_url, {'source_url': url}) + ie_key = 'Kaltura' + else: + PLAYER_REGEX = r'', + start_page, 'xml filename', default=None) + if not xml_name: + info = self._parse_html5_media_entries(url, start_page, video_id)[0] + info.update({ + 'title': remove_start(self._search_regex( + r'>Session Name:\s*<.*?>\s*(.+?)', start_page, + 'title', default=None) or self._og_search_title( + start_page, default=None), 'GDC Vault - '), + 'id': video_id, + 'display_id': display_id, + }) + return info + embed_url = '%s/xml/%s' % (xml_root, xml_name) + ie_key = 'DigitallySpeaking' return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), 'id': video_id, 'display_id': display_id, - 'title': title, + 'url': embed_url, + 'ie_key': ie_key, } diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index c8097249e..f10916081 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,16 +142,16 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index 4bca6f053..2ece5aac4 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -15,33 +15,39 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } + }, { + 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'only_matching': True, + }, { + 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 5b377ea83..4b6284a8d 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) - (?:.*?modalId=(?P[\da-zA-Z-]+))? + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ @@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', diff --git a/yt_dlp/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/yt_dlp/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index a54f49319..a4a30b1e6 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, - strip_or_none, try_get, ) @@ -45,32 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - title = main['title'] - subtitle = strip_or_none(main.get('subtitle')) - if subtitle: - title += ' - ' + subtitle - info.update({ - 'title': title, - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 63c11bd47..ae79ec6e0 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f], {} def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] subtitles = {} diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/yt_dlp/extractor/xfileshare.py +++ b/yt_dlp/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod diff --git a/yt_dlp/extractor/xtube.py b/yt_dlp/extractor/xtube.py index 98d2adb99..682e45bef 100644 --- a/yt_dlp/extractor/xtube.py +++ b/yt_dlp/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -71,10 +72,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -83,20 +84,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats)