From 5c37fb2c3f86e79a5fbccf93c5f74859098b98fb Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:14:12 +0900 Subject: [PATCH 1/3] [ie/tunein] Fix extractors, Improve metadata extraction --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/tunein.py | 341 ++++++++++++++++++++------------ 2 files changed, 217 insertions(+), 125 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bb595f924b..18d2f780c5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2177,6 +2177,7 @@ from .tubitv import ( ) from .tumblr import TumblrIE from .tunein import ( + TuneInEmbedIE, TuneInPodcastEpisodeIE, TuneInPodcastIE, TuneInShortenerIE, diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index 90fb04bf3d..874b68d22c 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -1,24 +1,33 @@ +import itertools import urllib.parse from .common import InfoExtractor from ..utils import ( - OnDemandPagedList, - determine_ext, + clean_html, + int_or_none, parse_iso8601, - traverse_obj, + parse_qs, + update_url_query, + url_or_none, ) +from ..utils.traversal import require, traverse_obj class TuneInBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com' + def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', query=None): + path = f'/{endpoint}' if endpoint else '' - def _extract_metadata(self, webpage, content_id): - return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False) + return self._download_json( + f'https://api.tunein.com/profiles/{item_id}{path}', item_id, note=note, query=query) def _extract_formats_and_subtitles(self, content_id): streams = self._download_json( - f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}', - content_id)['body'] + 'https://opml.radiotime.com/Tune.ashx', content_id, query={ + 'formats': 'mp3,aac,ogg,flash,hls', + 'id': content_id, + 'render': 'json', + }, + )['body'] formats, subtitles = [], {} for stream in streams: @@ -26,219 +35,301 @@ class TuneInBaseIE(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - elif determine_ext(stream['url']) == 'pls': - playlist_content = self._download_webpage(stream['url'], content_id) - formats.append({ - 'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False), - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) else: - formats.append({ - 'url': stream['url'], - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) + formats.append(traverse_obj(stream, { + 'abr': ('bitrate', {int_or_none}), + 'ext': ('media_type', {str}), + 'url': ('url', {self._proto_relative_url}, {url_or_none}), + })) return formats, subtitles class TuneInStationIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?Ps\d+)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/s\d+)'] - + IE_NAME = 'tunein:station' + _VALID_URL = r'https?://tunein\.com/radio/[^/]+(?Ps\d+)' _TESTS = [{ 'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { 'id': 's34682', + 'ext': 'mp3', 'title': str, + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'Seattle-Tacoma, US', - 'ext': 'mp3', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tunein.com/embed/player/s6404/', - 'only_matching': True, + 'params': {'skip_download': 'Livestream'}, }, { 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', 'info_dict': { 'id': 's24939', + 'ext': 'm4a', 'title': str, + 'alt_title': 'The biggest new pop and all-day vibes', + 'channel_follower_count': int, 'description': 'md5:ee2c56794844610d045f8caf5ff34d0c', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'London, UK', - 'ext': 'm4a', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, - }, - }] - _WEBPAGE_TESTS = [{ - 'url': 'https://www.martiniinthemorning.com/', - 'info_dict': { - 'id': 's55412', - 'ext': 'mp3', - 'title': 'TuneInStation video #s55412', - }, - 'expected_warnings': ['unable to extract hydration', 'Extractor failed to obtain "title"'], + 'params': {'skip_download': 'Livestream'}, }] def _real_extract(self, url): station_id = self._match_id(url) - - webpage = self._download_webpage(url, station_id) - metadata = self._extract_metadata(webpage, station_id) - formats, subtitles = self._extract_formats_and_subtitles(station_id) + return { 'id': station_id, - 'title': traverse_obj(metadata, ('profiles', station_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', station_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))), - 'location': traverse_obj( - metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'), - ('profiles', station_id, 'properties', 'location', 'displayName')), 'formats': formats, 'subtitles': subtitles, - 'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')), + **traverse_obj(self._call_api(station_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}), + 'description': ('Description', {clean_html}, filter), + 'is_live': ('Actions', 'Play', 'IsLive', {bool}), + 'location': ('Properties', 'Location', 'DisplayName', {str}, any), + 'thumbnail': ('Image', {url_or_none}), + })), } class TuneInPodcastIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?Pp\d+)/?(?:#|$)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/p\d+)'] - + IE_NAME = 'tunein:podcast:program' + _PAGE_SIZE = 20 + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?(?:\?(?![^#]*[?&](?i:topicid)=)[^#]*)?(?:#|$)' _TESTS = [{ - 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019', + 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/', 'info_dict': { 'id': 'p1153019', 'title': 'Lex Fridman Podcast', - 'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d', }, 'playlist_mincount': 200, - }, { - 'url': 'https://tunein.com/embed/player/p191660/', - 'only_matching': True, }, { 'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/', 'info_dict': { 'id': 'p14', 'title': 'BBC News', - 'description': 'md5:30b9622bcc4bd101d4acd6f38f284aed', }, - 'playlist_mincount': 36, + 'playlist_mincount': 35, }] - _PAGE_SIZE = 30 - - def _real_extract(self, url): - podcast_id = self._match_id(url) - - webpage = self._download_webpage(url, podcast_id, fatal=False) - metadata = self._extract_metadata(webpage, podcast_id) - - def page_func(page_num): - api_response = self._download_json( - f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id, - note=f'Downloading page {page_num + 1}', query={ + def _entries(self, podcast_id): + for page in itertools.count(): + contents = self._call_api( + podcast_id, 'contents', + f'Downloading page {page + 1}', query={ 'filter': 't:free', - 'offset': page_num * self._PAGE_SIZE, 'limit': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, }) - return [ - self.url_result( - f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}', - TuneInPodcastEpisodeIE, title=episode.get('Title')) - for episode in api_response['Items']] + yield from traverse_obj(contents, ( + 'Items', ..., 'GuideId', {str}, filter, all, filter)) + + if not traverse_obj(contents, ('Paging', 'Next', {url_or_none})): + break + + def _real_extract(self, url): + podcast_id = self._match_id(url) - entries = OnDemandPagedList(page_func, self._PAGE_SIZE) - return self.playlist_result( - entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')), - description=traverse_obj(metadata, ('profiles', podcast_id, 'description'))) + return self.playlist_from_matches( + self._entries(podcast_id), podcast_id, + traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str})), + getter=lambda x: update_url_query(url, {'topicId': x[1:]})) class TuneInPodcastEpisodeIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?Pp\d+)/?\?topicId=(?P\w\d+)' - + IE_NAME = 'tunein:podcast' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?\?(?:[^#]*)?(?<=\?|&)(?i:topicid)=(?P\d+)(?:[&#]|$)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', 'info_dict': { 'id': 't236404354', + 'ext': 'mp3', 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', - 'description': 'md5:2784533b98f8ac45c0820b1e4a8d8bb2', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'thumbnail': r're:https?://.+', 'timestamp': 1673458571, 'upload_date': '20230111', - 'series_id': 'p1153019', 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + }, + }, { + 'url': 'https://tunein.com/podcasts/The-BOB--TOM-Show-Free-Podcast-p20069/?topicId=174556405', + 'info_dict': { + 'id': 't174556405', 'ext': 'mp3', + 'title': 'B&T Extra: Ohhh Yeah, It\'s Sexy Time', + 'alt_title': 'Westwood One >', + 'cast': 'count:2', + 'description': 'md5:6828234f410ab88c85655495c5fcfa88', + 'display_id': '174556405', + 'duration': 1203, + 'series': 'The BOB & TOM Show Free Podcast', + 'series_id': 'p20069', + 'thumbnail': r're:https?://.+', + 'timestamp': 1661799600, + 'upload_date': '20220829', }, }] def _real_extract(self, url): - podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id') - episode_id = f't{episode_id}' - - webpage = self._download_webpage(url, episode_id) - metadata = self._extract_metadata(webpage, episode_id) - + series_id = self._match_valid_url(url).group('series_id') + display_id = traverse_obj(parse_qs(url), ( + ('topicid', 'topicId'), -1, {str}, any, {require('topic ID')})) + episode_id = f't{display_id}' formats, subtitles = self._extract_formats_and_subtitles(episode_id) + return { 'id': episode_id, - 'title': traverse_obj(metadata, ('profiles', episode_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', episode_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))), - 'series_id': podcast_id, - 'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')), + 'display_id': display_id, 'formats': formats, + 'series': traverse_obj(self._call_api(series_id), ('Item', 'Title', {clean_html})), + 'series_id': series_id, 'subtitles': subtitles, + **traverse_obj(self._call_api(episode_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'cast': ( + 'Properties', 'ParentProgram', 'Hosts', {clean_html}, + {lambda x: x.split(';')}, ..., {str.strip}, filter, all, filter), + 'description': ('Description', {clean_html}, filter), + 'duration': ('Actions', 'Play', 'Duration', {int_or_none}), + 'thumbnail': ('Image', {url_or_none}), + 'timestamp': ('Actions', 'Play', 'PublishTime', {parse_iso8601}), + })), } +class TuneInEmbedIE(TuneInBaseIE): + IE_NAME = 'tunein:embed' + _VALID_URL = r'https?://tunein\.com/embed/player/(?P[^/?#]+)' + _EMBED_REGEX = [r']+src=["\'](?P(?:https?:)?//tunein\.com/embed/player/[^/?#]+)'] + _TESTS = [{ + 'url': 'https://tunein.com/embed/player/s6404/', + 'info_dict': { + 'id': 's6404', + 'ext': 'mp3', + 'title': str, + 'alt_title': 'South Africa\'s News and Information Leader', + 'channel_follower_count': int, + 'live_status': 'is_live', + 'location': 'Johannesburg, South Africa', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'https://tunein.com/embed/player/t236404354/', + 'info_dict': { + 'id': 't236404354', + 'ext': 'mp3', + 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', + }, + }, { + 'url': 'https://tunein.com/embed/player/p191660/', + 'info_dict': { + 'id': 'p191660', + 'title': 'SBS Tamil', + }, + 'playlist_mincount': 197, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.martiniinthemorning.com/', + 'info_dict': { + 'id': 's55412', + 'ext': 'mp3', + 'title': str, + 'alt_title': 'Now that\'s music!', + 'channel_follower_count': int, + 'description': 'md5:41588a3e2cf34b3eafc6c33522fa611a', + 'live_status': 'is_live', + 'location': 'US', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + kind = { + 'p': 'program', + 's': 'station', + 't': 'topic', + }.get(embed_id[:1]) + + return self.url_result( + f'https://tunein.com/{kind}/?{kind}id={embed_id[1:]}') + + class TuneInShortenerIE(InfoExtractor): - _WORKING = False IE_NAME = 'tunein:shortener' IE_DESC = False # Do not list - _VALID_URL = r'https?://tun\.in/(?P[A-Za-z0-9]+)' - + _VALID_URL = r'https?://tun\.in/(?P[^/?#]+)' _TESTS = [{ - # test redirection 'url': 'http://tun.in/ser7s', 'info_dict': { 'id': 's34682', 'title': str, + 'ext': 'mp3', + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', 'location': 'Seattle-Tacoma, US', - 'ext': 'mp3', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/tqeeFw', + 'info_dict': { + 'id': 't236404354', + 'title': str, + 'ext': 'mp3', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', }, - 'params': { - 'skip_download': True, # live stream + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/pei6i', + 'info_dict': { + 'id': 'p14', + 'title': 'BBC News', }, + 'playlist_mincount': 35, }] def _real_extract(self, url): redirect_id = self._match_id(url) # The server doesn't support HEAD requests - urlh = self._request_webpage( - url, redirect_id, note='Downloading redirect page') - - url = urlh.url - url_parsed = urllib.parse.urlparse(url) - if url_parsed.port == 443: - url = url_parsed._replace(netloc=url_parsed.hostname).url + urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page') + parsed = urllib.parse.urlparse(urlh.url) - self.to_screen(f'Following redirect: {url}') - return self.url_result(url) + return self.url_result( + urllib.parse.urlunparse(parsed._replace(netloc=parsed.hostname))) From ecd1499881210dc6e9d5b5ccc45a7801c81b1dcc Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 10 Aug 2025 18:42:26 +0900 Subject: [PATCH 2/3] fix --- yt_dlp/extractor/tunein.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index 874b68d22c..f6c37ddaf7 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -6,11 +6,10 @@ from ..utils import ( clean_html, int_or_none, parse_iso8601, - parse_qs, update_url_query, url_or_none, ) -from ..utils.traversal import require, traverse_obj +from ..utils.traversal import traverse_obj class TuneInBaseIE(InfoExtractor): @@ -101,7 +100,7 @@ class TuneInStationIE(TuneInBaseIE): class TuneInPodcastIE(TuneInBaseIE): IE_NAME = 'tunein:podcast:program' _PAGE_SIZE = 20 - _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?(?:\?(?![^#]*[?&](?i:topicid)=)[^#]*)?(?:#|$)' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?(?:\?(?![^#]*(?i:\btopicid)=)[^#]*)?(?:#|$)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/', 'info_dict': { @@ -145,7 +144,7 @@ class TuneInPodcastIE(TuneInBaseIE): class TuneInPodcastEpisodeIE(TuneInBaseIE): IE_NAME = 'tunein:podcast' - _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?\?(?:[^#]*)?(?<=\?|&)(?i:topicid)=(?P\d+)(?:[&#]|$)' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?\?[^#]*?(?<=\?|&)(?i:\btopicid)=(?P\d+)(?:[&#]|$)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', 'info_dict': { @@ -183,9 +182,7 @@ class TuneInPodcastEpisodeIE(TuneInBaseIE): }] def _real_extract(self, url): - series_id = self._match_valid_url(url).group('series_id') - display_id = traverse_obj(parse_qs(url), ( - ('topicid', 'topicId'), -1, {str}, any, {require('topic ID')})) + series_id, display_id = self._match_valid_url(url).group('series_id', 'id') episode_id = f't{display_id}' formats, subtitles = self._extract_formats_and_subtitles(episode_id) From 9cc551973b7606469cea934871af45c07339911a Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 17 Aug 2025 14:08:56 +0900 Subject: [PATCH 3/3] Apply suggestions --- yt_dlp/extractor/tunein.py | 71 ++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index f6c37ddaf7..2f255a01ab 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -1,10 +1,13 @@ -import itertools +import functools import urllib.parse from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, + UnsupportedError, clean_html, int_or_none, + join_nonempty, parse_iso8601, update_url_query, url_or_none, @@ -14,10 +17,9 @@ from ..utils.traversal import traverse_obj class TuneInBaseIE(InfoExtractor): def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', query=None): - path = f'/{endpoint}' if endpoint else '' - return self._download_json( - f'https://api.tunein.com/profiles/{item_id}{path}', item_id, note=note, query=query) + join_nonempty('https://api.tunein.com/profiles', item_id, endpoint, delim='/'), + item_id, note=note, query=query) def _extract_formats_and_subtitles(self, content_id): streams = self._download_json( @@ -25,11 +27,10 @@ class TuneInBaseIE(InfoExtractor): 'formats': 'mp3,aac,ogg,flash,hls', 'id': content_id, 'render': 'json', - }, - )['body'] + }) formats, subtitles = [], {} - for stream in streams: + for stream in traverse_obj(streams, ('body', lambda _, v: url_or_none(v['url']))): if stream.get('media_type') == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) formats.extend(fmts) @@ -38,7 +39,7 @@ class TuneInBaseIE(InfoExtractor): formats.append(traverse_obj(stream, { 'abr': ('bitrate', {int_or_none}), 'ext': ('media_type', {str}), - 'url': ('url', {self._proto_relative_url}, {url_or_none}), + 'url': ('url', {self._proto_relative_url}), })) return formats, subtitles @@ -46,7 +47,7 @@ class TuneInBaseIE(InfoExtractor): class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' - _VALID_URL = r'https?://tunein\.com/radio/[^/]+(?Ps\d+)' + _VALID_URL = r'https?://tunein\.com/radio/[^/?#]+(?Ps\d+)' _TESTS = [{ 'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { @@ -91,7 +92,7 @@ class TuneInStationIE(TuneInBaseIE): 'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}), 'description': ('Description', {clean_html}, filter), 'is_live': ('Actions', 'Play', 'IsLive', {bool}), - 'location': ('Properties', 'Location', 'DisplayName', {str}, any), + 'location': ('Properties', 'Location', 'DisplayName', {str}), 'thumbnail': ('Image', {url_or_none}), })), } @@ -100,7 +101,7 @@ class TuneInStationIE(TuneInBaseIE): class TuneInPodcastIE(TuneInBaseIE): IE_NAME = 'tunein:podcast:program' _PAGE_SIZE = 20 - _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?(?:\?(?![^#]*(?i:\btopicid)=)[^#]*)?(?:#|$)' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?Pp\d+)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/', 'info_dict': { @@ -117,34 +118,36 @@ class TuneInPodcastIE(TuneInBaseIE): 'playlist_mincount': 35, }] - def _entries(self, podcast_id): - for page in itertools.count(): - contents = self._call_api( - podcast_id, 'contents', - f'Downloading page {page + 1}', query={ - 'filter': 't:free', - 'limit': self._PAGE_SIZE, - 'offset': page * self._PAGE_SIZE, - }) + @classmethod + def suitable(cls, url): + return False if TuneInPodcastEpisodeIE.suitable(url) else super().suitable(url) - yield from traverse_obj(contents, ( - 'Items', ..., 'GuideId', {str}, filter, all, filter)) + def _fetch_page(self, url, podcast_id, page=0): + items = self._call_api( + podcast_id, 'contents', f'Downloading page {page + 1}', query={ + 'filter': 't:free', + 'limit': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, + )['Items'] - if not traverse_obj(contents, ('Paging', 'Next', {url_or_none})): - break + if items: + for item in traverse_obj(items, ( + ..., 'GuideId', {str}, filter, all, filter, + )): + yield self.url_result(update_url_query(url, {'topicId': item[1:]})) def _real_extract(self, url): podcast_id = self._match_id(url) - return self.playlist_from_matches( - self._entries(podcast_id), podcast_id, - traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str})), - getter=lambda x: update_url_query(url, {'topicId': x[1:]})) + return self.playlist_result(OnDemandPagedList( + functools.partial(self._fetch_page, url, podcast_id), self._PAGE_SIZE), + podcast_id, traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str}))) class TuneInPodcastEpisodeIE(TuneInBaseIE): IE_NAME = 'tunein:podcast' - _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?\?[^#]*?(?<=\?|&)(?i:\btopicid)=(?P\d+)(?:[&#]|$)' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?Pp\d+)/?\?(?:[^#]+&)?(?i:topicid)=(?P\d+)' _TESTS = [{ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', 'info_dict': { @@ -210,7 +213,7 @@ class TuneInPodcastEpisodeIE(TuneInBaseIE): class TuneInEmbedIE(TuneInBaseIE): IE_NAME = 'tunein:embed' _VALID_URL = r'https?://tunein\.com/embed/player/(?P[^/?#]+)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?:)?//tunein\.com/embed/player/[^/?#]+)'] + _EMBED_REGEX = [r']+\bsrc=["\'](?P(?:https?:)?//tunein\.com/embed/player/[^/?#"\']+)'] _TESTS = [{ 'url': 'https://tunein.com/embed/player/s6404/', 'info_dict': { @@ -247,7 +250,7 @@ class TuneInEmbedIE(TuneInBaseIE): 'id': 'p191660', 'title': 'SBS Tamil', }, - 'playlist_mincount': 197, + 'playlist_mincount': 196, }] _WEBPAGE_TESTS = [{ 'url': 'https://www.martiniinthemorning.com/', @@ -328,5 +331,7 @@ class TuneInShortenerIE(InfoExtractor): urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page') parsed = urllib.parse.urlparse(urlh.url) - return self.url_result( - urllib.parse.urlunparse(parsed._replace(netloc=parsed.hostname))) + new_url = urllib.parse.urlunparse(parsed._replace(netloc=parsed.hostname)) + if self.suitable(new_url): # Prevent infinite loop in case redirect fails + raise UnsupportedError(new_url) + return self.url_result(new_url)