diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bb595f924b..18d2f780c5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2177,6 +2177,7 @@ from .tubitv import ( ) from .tumblr import TumblrIE from .tunein import ( + TuneInEmbedIE, TuneInPodcastEpisodeIE, TuneInPodcastIE, TuneInShortenerIE, diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index 90fb04bf3d..2f255a01ab 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -1,244 +1,337 @@ +import functools import urllib.parse from .common import InfoExtractor from ..utils import ( OnDemandPagedList, - determine_ext, + UnsupportedError, + clean_html, + int_or_none, + join_nonempty, parse_iso8601, - traverse_obj, + update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class TuneInBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com' - - def _extract_metadata(self, webpage, content_id): - return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False) + def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', query=None): + return self._download_json( + join_nonempty('https://api.tunein.com/profiles', item_id, endpoint, delim='/'), + item_id, note=note, query=query) def _extract_formats_and_subtitles(self, content_id): streams = self._download_json( - f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}', - content_id)['body'] + 'https://opml.radiotime.com/Tune.ashx', content_id, query={ + 'formats': 'mp3,aac,ogg,flash,hls', + 'id': content_id, + 'render': 'json', + }) formats, subtitles = [], {} - for stream in streams: + for stream in traverse_obj(streams, ('body', lambda _, v: url_or_none(v['url']))): if stream.get('media_type') == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - elif determine_ext(stream['url']) == 'pls': - playlist_content = self._download_webpage(stream['url'], content_id) - formats.append({ - 'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False), - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) else: - formats.append({ - 'url': stream['url'], - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) + formats.append(traverse_obj(stream, { + 'abr': ('bitrate', {int_or_none}), + 'ext': ('media_type', {str}), + 'url': ('url', {self._proto_relative_url}), + })) return formats, subtitles class TuneInStationIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?Ps\d+)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/s\d+)'] - + IE_NAME = 'tunein:station' + _VALID_URL = r'https?://tunein\.com/radio/[^/?#]+(?Ps\d+)' _TESTS = [{ 'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { 'id': 's34682', + 'ext': 'mp3', 'title': str, + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'Seattle-Tacoma, US', - 'ext': 'mp3', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tunein.com/embed/player/s6404/', - 'only_matching': True, + 'params': {'skip_download': 'Livestream'}, }, { 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', 'info_dict': { 'id': 's24939', + 'ext': 'm4a', 'title': str, + 'alt_title': 'The biggest new pop and all-day vibes', + 'channel_follower_count': int, 'description': 'md5:ee2c56794844610d045f8caf5ff34d0c', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'London, UK', - 'ext': 'm4a', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, - }, - }] - _WEBPAGE_TESTS = [{ - 'url': 'https://www.martiniinthemorning.com/', - 'info_dict': { - 'id': 's55412', - 'ext': 'mp3', - 'title': 'TuneInStation video #s55412', - }, - 'expected_warnings': ['unable to extract hydration', 'Extractor failed to obtain "title"'], + 'params': {'skip_download': 'Livestream'}, }] def _real_extract(self, url): station_id = self._match_id(url) - - webpage = self._download_webpage(url, station_id) - metadata = self._extract_metadata(webpage, station_id) - formats, subtitles = self._extract_formats_and_subtitles(station_id) + return { 'id': station_id, - 'title': traverse_obj(metadata, ('profiles', station_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', station_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))), - 'location': traverse_obj( - metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'), - ('profiles', station_id, 'properties', 'location', 'displayName')), 'formats': formats, 'subtitles': subtitles, - 'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')), + **traverse_obj(self._call_api(station_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}), + 'description': ('Description', {clean_html}, filter), + 'is_live': ('Actions', 'Play', 'IsLive', {bool}), + 'location': ('Properties', 'Location', 'DisplayName', {str}), + 'thumbnail': ('Image', {url_or_none}), + })), } class TuneInPodcastIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?Pp\d+)/?(?:#|$)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/p\d+)'] - + IE_NAME = 'tunein:podcast:program' + _PAGE_SIZE = 20 + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?Pp\d+)' _TESTS = [{ - 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019', + 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/', 'info_dict': { 'id': 'p1153019', 'title': 'Lex Fridman Podcast', - 'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d', }, 'playlist_mincount': 200, - }, { - 'url': 'https://tunein.com/embed/player/p191660/', - 'only_matching': True, }, { 'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/', 'info_dict': { 'id': 'p14', 'title': 'BBC News', - 'description': 'md5:30b9622bcc4bd101d4acd6f38f284aed', }, - 'playlist_mincount': 36, + 'playlist_mincount': 35, }] - _PAGE_SIZE = 30 + @classmethod + def suitable(cls, url): + return False if TuneInPodcastEpisodeIE.suitable(url) else super().suitable(url) + + def _fetch_page(self, url, podcast_id, page=0): + items = self._call_api( + podcast_id, 'contents', f'Downloading page {page + 1}', query={ + 'filter': 't:free', + 'limit': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, + )['Items'] + + if items: + for item in traverse_obj(items, ( + ..., 'GuideId', {str}, filter, all, filter, + )): + yield self.url_result(update_url_query(url, {'topicId': item[1:]})) def _real_extract(self, url): podcast_id = self._match_id(url) - webpage = self._download_webpage(url, podcast_id, fatal=False) - metadata = self._extract_metadata(webpage, podcast_id) - - def page_func(page_num): - api_response = self._download_json( - f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id, - note=f'Downloading page {page_num + 1}', query={ - 'filter': 't:free', - 'offset': page_num * self._PAGE_SIZE, - 'limit': self._PAGE_SIZE, - }) - - return [ - self.url_result( - f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}', - TuneInPodcastEpisodeIE, title=episode.get('Title')) - for episode in api_response['Items']] - - entries = OnDemandPagedList(page_func, self._PAGE_SIZE) - return self.playlist_result( - entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')), - description=traverse_obj(metadata, ('profiles', podcast_id, 'description'))) + return self.playlist_result(OnDemandPagedList( + functools.partial(self._fetch_page, url, podcast_id), self._PAGE_SIZE), + podcast_id, traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str}))) class TuneInPodcastEpisodeIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?Pp\d+)/?\?topicId=(?P\w\d+)' - + IE_NAME = 'tunein:podcast' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?Pp\d+)/?\?(?:[^#]+&)?(?i:topicid)=(?P\d+)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', 'info_dict': { 'id': 't236404354', + 'ext': 'mp3', 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', - 'description': 'md5:2784533b98f8ac45c0820b1e4a8d8bb2', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'thumbnail': r're:https?://.+', 'timestamp': 1673458571, 'upload_date': '20230111', - 'series_id': 'p1153019', 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + }, + }, { + 'url': 'https://tunein.com/podcasts/The-BOB--TOM-Show-Free-Podcast-p20069/?topicId=174556405', + 'info_dict': { + 'id': 't174556405', 'ext': 'mp3', + 'title': 'B&T Extra: Ohhh Yeah, It\'s Sexy Time', + 'alt_title': 'Westwood One >', + 'cast': 'count:2', + 'description': 'md5:6828234f410ab88c85655495c5fcfa88', + 'display_id': '174556405', + 'duration': 1203, + 'series': 'The BOB & TOM Show Free Podcast', + 'series_id': 'p20069', + 'thumbnail': r're:https?://.+', + 'timestamp': 1661799600, + 'upload_date': '20220829', }, }] def _real_extract(self, url): - podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id') - episode_id = f't{episode_id}' - - webpage = self._download_webpage(url, episode_id) - metadata = self._extract_metadata(webpage, episode_id) - + series_id, display_id = self._match_valid_url(url).group('series_id', 'id') + episode_id = f't{display_id}' formats, subtitles = self._extract_formats_and_subtitles(episode_id) + return { 'id': episode_id, - 'title': traverse_obj(metadata, ('profiles', episode_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', episode_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))), - 'series_id': podcast_id, - 'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')), + 'display_id': display_id, 'formats': formats, + 'series': traverse_obj(self._call_api(series_id), ('Item', 'Title', {clean_html})), + 'series_id': series_id, 'subtitles': subtitles, + **traverse_obj(self._call_api(episode_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'cast': ( + 'Properties', 'ParentProgram', 'Hosts', {clean_html}, + {lambda x: x.split(';')}, ..., {str.strip}, filter, all, filter), + 'description': ('Description', {clean_html}, filter), + 'duration': ('Actions', 'Play', 'Duration', {int_or_none}), + 'thumbnail': ('Image', {url_or_none}), + 'timestamp': ('Actions', 'Play', 'PublishTime', {parse_iso8601}), + })), } +class TuneInEmbedIE(TuneInBaseIE): + IE_NAME = 'tunein:embed' + _VALID_URL = r'https?://tunein\.com/embed/player/(?P[^/?#]+)' + _EMBED_REGEX = [r']+\bsrc=["\'](?P(?:https?:)?//tunein\.com/embed/player/[^/?#"\']+)'] + _TESTS = [{ + 'url': 'https://tunein.com/embed/player/s6404/', + 'info_dict': { + 'id': 's6404', + 'ext': 'mp3', + 'title': str, + 'alt_title': 'South Africa\'s News and Information Leader', + 'channel_follower_count': int, + 'live_status': 'is_live', + 'location': 'Johannesburg, South Africa', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'https://tunein.com/embed/player/t236404354/', + 'info_dict': { + 'id': 't236404354', + 'ext': 'mp3', + 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', + }, + }, { + 'url': 'https://tunein.com/embed/player/p191660/', + 'info_dict': { + 'id': 'p191660', + 'title': 'SBS Tamil', + }, + 'playlist_mincount': 196, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.martiniinthemorning.com/', + 'info_dict': { + 'id': 's55412', + 'ext': 'mp3', + 'title': str, + 'alt_title': 'Now that\'s music!', + 'channel_follower_count': int, + 'description': 'md5:41588a3e2cf34b3eafc6c33522fa611a', + 'live_status': 'is_live', + 'location': 'US', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + kind = { + 'p': 'program', + 's': 'station', + 't': 'topic', + }.get(embed_id[:1]) + + return self.url_result( + f'https://tunein.com/{kind}/?{kind}id={embed_id[1:]}') + + class TuneInShortenerIE(InfoExtractor): - _WORKING = False IE_NAME = 'tunein:shortener' IE_DESC = False # Do not list - _VALID_URL = r'https?://tun\.in/(?P[A-Za-z0-9]+)' - + _VALID_URL = r'https?://tun\.in/(?P[^/?#]+)' _TESTS = [{ - # test redirection 'url': 'http://tun.in/ser7s', 'info_dict': { 'id': 's34682', 'title': str, + 'ext': 'mp3', + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'Seattle-Tacoma, US', - 'ext': 'mp3', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/tqeeFw', + 'info_dict': { + 'id': 't236404354', + 'title': str, + 'ext': 'mp3', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', }, - 'params': { - 'skip_download': True, # live stream + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/pei6i', + 'info_dict': { + 'id': 'p14', + 'title': 'BBC News', }, + 'playlist_mincount': 35, }] def _real_extract(self, url): redirect_id = self._match_id(url) # The server doesn't support HEAD requests - urlh = self._request_webpage( - url, redirect_id, note='Downloading redirect page') - - url = urlh.url - url_parsed = urllib.parse.urlparse(url) - if url_parsed.port == 443: - url = url_parsed._replace(netloc=url_parsed.hostname).url + urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page') + parsed = urllib.parse.urlparse(urlh.url) - self.to_screen(f'Following redirect: {url}') - return self.url_result(url) + new_url = urllib.parse.urlunparse(parsed._replace(netloc=parsed.hostname)) + if self.suitable(new_url): # Prevent infinite loop in case redirect fails + raise UnsupportedError(new_url) + return self.url_result(new_url)