From ab0eadedb62faf51ab3510be1bcf399dc4912eac Mon Sep 17 00:00:00 2001
From: gandbg <62039938+gandbg@users.noreply.github.com>
Date: Wed, 30 Jul 2025 19:17:36 +0200
Subject: [PATCH 1/3] [ie/SpreakerShow] Centralize single episode extraction
 and improve metadata extraction

---
 yt_dlp/extractor/spreaker.py | 93 +++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/yt_dlp/extractor/spreaker.py b/yt_dlp/extractor/spreaker.py
index c64c2fcd2e..671610dabf 100644
--- a/yt_dlp/extractor/spreaker.py
+++ b/yt_dlp/extractor/spreaker.py
@@ -2,7 +2,6 @@ import itertools
 
 from .common import InfoExtractor
 from ..utils import (
-    filter_dict,
     float_or_none,
     int_or_none,
     parse_qs,
@@ -127,53 +126,89 @@ class SpreakerIE(InfoExtractor):
 
 class SpreakerShowIE(InfoExtractor):
     _VALID_URL = [
-        r'https?://api\.spreaker\.com/show/(?P<id>\d+)',
+        r'https?://api\.spreaker\.com/(?:v2/)?shows?/(?P<id>\d+)',
         r'https?://(?:www\.)?spreaker\.com/podcast/[\w-]+--(?P<id>[\d]+)',
         r'https?://(?:www\.)?spreaker\.com/show/(?P<id>\d+)/episodes/feed',
     ]
     _TESTS = [{
-        'url': 'https://api.spreaker.com/show/4652058',
+        'url': 'https://api.spreaker.com/v2/shows/4652058',
         'info_dict': {
-            'id': '4652058',
+            'id': 4652058,
+            'display_id': '3-ninjas-podcast',
+            'title': 'The Dojo w/ Domino & Hesh Jones',
+            'description': 'md5:d3277d9d3264b85a56f34de37820af95',
+            'uploader': 'The Dojo w/ Domino & Hesh Jone',
+            'uploader_id': 13414919,
+            'uploader_url': 'https://www.spreaker.com/user/the-dojo-w-domino-hesh-jone--13414919',
+            'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/images.spreaker.com/original/2808a2bb63a36549ca25b9a72492c70a.jpg',
+            'categories': ['Comedy', 'Animation & Manga', 'Video Games'],
         },
         'playlist_mincount': 118,
     }, {
         'url': 'https://www.spreaker.com/podcast/health-wealth--5918323',
         'info_dict': {
-            'id': '5918323',
+            'id': 5918323,
+            'display_id': 'itpodradio-health-wealth',
+            'title': 'Health Wealth',
+            'description': 'md5:99e7a46c0c39b7b9f5aee92452216864',
+            'uploader': 'India Today Podcast',
+            'uploader_id': 15714861,
+            'uploader_url': 'https://www.spreaker.com/user/india-today-podcast--15714861',
+            'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/images.spreaker.com/original/cb96e6b9a211c1a004e4a027f696f8c2.jpg',
+            'categories': ['Health & Fitness'],
         },
         'playlist_mincount': 60,
     }, {
         'url': 'https://www.spreaker.com/show/5887186/episodes/feed',
         'info_dict': {
-            'id': '5887186',
+            'id': 5887186,
+            'display_id': 'orbinea',
+            'title': 'Orbinéa Le Monde des Odyssées| Documentaire Podcast Histoire pour dormir Livre Audio Enfant & Adulte',
+            'description': 'md5:79101727388ece4114ae4fabc8861bb5',
+            'uploader': 'Orbinea Studio',
+            'uploader_id': 17206155,
+            'uploader_url': 'https://www.spreaker.com/user/orbinea-studio--17206155',
+            'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/images.spreaker.com/original/0d755be30d97fb65f8a8f2803a5edb57.jpg',
+            'categories': ['Science', 'Documentary', 'Education'],
         },
         'playlist_mincount': 290,
     }]
 
-    def _entries(self, show_id, key=None):
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        show_data = self._download_json(
+            f'https://api.spreaker.com/v2/shows/{show_id}', show_id,
+            note='Downloading JSON show metadata')
+        episodes = []
+        episodes_api_url = f'https://api.spreaker.com/v2/shows/{show_id}/episodes?limit=100'
+
         for page_num in itertools.count(1):
-            episodes = self._download_json(
-                f'https://api.spreaker.com/show/{show_id}/episodes',
-                show_id, note=f'Downloading JSON page {page_num}', query=filter_dict({
-                    'page': page_num,
-                    'max_per_page': 100,
-                    'key': key,
-                }))
-            pager = try_get(episodes, lambda x: x['response']['pager'], dict)
-            if not pager:
-                break
-            results = pager.get('results')
-            if not results or not isinstance(results, list):
-                break
-            for result in results:
-                if not isinstance(result, dict):
-                    continue
-                yield _extract_episode(result)
-            if page_num == pager.get('last_page'):
+            episodes_api = self._download_json(episodes_api_url, show_id,
+                note=f'Downloading JSON episodes metadata page {page_num}')
+            episodes_in_page = traverse_obj(episodes_api, ('response', 'items', ..., {
+                'url': 'site_url',
+                'id': 'episode_id',
+                'title': 'title',
+            }))
+
+            for i in episodes_in_page:
+                episodes.append(self.url_result(i['url'], ie=SpreakerIE.ie_key(), video_id=i.get('id'), video_title=i.get('title')))
+
+            episodes_api_url = traverse_obj(episodes_api, ('response', 'next_url'), default=None)
+            if episodes_api_url is None:
                 break
 
-    def _real_extract(self, url):
-        show_id = self._match_id(url)
-        key = traverse_obj(parse_qs(url), ('key', 0))
-        return self.playlist_result(self._entries(show_id, key), playlist_id=show_id)
+        return {
+            '_type': 'playlist',
+            'id': int_or_none(show_id),
+            'display_id': traverse_obj(show_data, ('response', 'show', 'permalink')),
+            'title': traverse_obj(show_data, ('response', 'show', 'title')),
+            'description': traverse_obj(show_data, ('response', 'show', 'description')),
+            'thumbnail': traverse_obj(show_data, ('response', 'show', 'image_original_url')),
+            'uploader': traverse_obj(show_data, ('response', 'show', 'author', 'fullname')),
+            'uploader_id': traverse_obj(show_data, ('response', 'show', 'author', 'user_id')),
+            'uploader_url': traverse_obj(show_data, ('response', 'show', 'author', 'site_url')),
+            'webpage_url': traverse_obj(show_data, ('response', 'show', 'site_url')),
+            'categories': traverse_obj(show_data, ('response', 'show', ('category', 'category_2', 'category_3'), 'name')),
+            'entries': episodes,
+        }
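
Note for reviewers (not part of the patch): the rewritten _real_extract walks Spreaker's v2 API instead of the old pager-based /show/<id>/episodes endpoint. Below is a rough standalone sketch of that request flow using only the standard library. The endpoints and field names (response.show, response.items, response.next_url, site_url, episode_id, title) are taken from the diff above; everything else (function name, error handling being omitted) is illustrative only.

    import json
    import urllib.request

    def list_show_episodes(show_id):
        """Fetch show metadata, then follow response.next_url until the last page."""
        def get_json(url):
            with urllib.request.urlopen(url) as resp:
                return json.load(resp)

        show = get_json(f'https://api.spreaker.com/v2/shows/{show_id}')['response']['show']
        episodes = []
        page_url = f'https://api.spreaker.com/v2/shows/{show_id}/episodes?limit=100'
        while page_url:  # the extractor stops when next_url is missing/None
            page = get_json(page_url)['response']
            episodes.extend(
                {'url': item.get('site_url'), 'id': item.get('episode_id'), 'title': item.get('title')}
                for item in page.get('items') or [])
            page_url = page.get('next_url')
        return show, episodes
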
From 06b7a952f0b3ef442f34d6a4cdc05f6123c483e8 Mon Sep 17 00:00:00 2001
From: gandbg <62039938+gandbg@users.noreply.github.com>
Date: Wed, 30 Jul 2025 23:57:28 +0200
Subject: [PATCH 2/3] [ie/SpreakerShow] Reimplement support for 'key' URL
 parameter

---
 yt_dlp/extractor/spreaker.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/spreaker.py b/yt_dlp/extractor/spreaker.py
index 671610dabf..ffa4c58a95 100644
--- a/yt_dlp/extractor/spreaker.py
+++ b/yt_dlp/extractor/spreaker.py
@@ -8,6 +8,7 @@ from ..utils import (
     str_or_none,
     try_get,
     unified_timestamp,
+    update_url_query,
     url_or_none,
 )
 from ..utils.traversal import traverse_obj
@@ -176,15 +177,18 @@ class SpreakerShowIE(InfoExtractor):
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
+        additional_api_query = traverse_obj(parse_qs(url), {
+            'key': ('key', 0),
+        }) or {}
         show_data = self._download_json(
             f'https://api.spreaker.com/v2/shows/{show_id}', show_id,
-            note='Downloading JSON show metadata')
+            note='Downloading JSON show metadata', query=additional_api_query)
         episodes = []
         episodes_api_url = f'https://api.spreaker.com/v2/shows/{show_id}/episodes?limit=100'
 
         for page_num in itertools.count(1):
             episodes_api = self._download_json(episodes_api_url, show_id,
-                note=f'Downloading JSON episodes metadata page {page_num}')
+                note=f'Downloading JSON episodes metadata page {page_num}', query=additional_api_query)
             episodes_in_page = traverse_obj(episodes_api, ('response', 'items', ..., {
                 'url': 'site_url',
                 'id': 'episode_id',
@@ -192,7 +196,7 @@ class SpreakerShowIE(InfoExtractor):
             }))
 
             for i in episodes_in_page:
-                episodes.append(self.url_result(i['url'], ie=SpreakerIE.ie_key(), video_id=i.get('id'), video_title=i.get('title')))
+                episodes.append(self.url_result(update_url_query(i['url'], additional_api_query), ie=SpreakerIE.ie_key(), video_id=i.get('id'), video_title=i.get('title')))
 
             episodes_api_url = traverse_obj(episodes_api, ('response', 'next_url'), default=None)
             if episodes_api_url is None:
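
Quick illustration of the 'key' plumbing in patch 2 (again, not part of the patch; the show/episode IDs and the abc123 value are made up). parse_qs, update_url_query and traverse_obj are the existing yt-dlp helpers the diff already imports:

    from yt_dlp.utils import parse_qs, update_url_query
    from yt_dlp.utils.traversal import traverse_obj

    url = 'https://www.spreaker.com/show/123456/episodes/feed?key=abc123'
    additional_api_query = traverse_obj(parse_qs(url), {'key': ('key', 0)}) or {}
    # -> {'key': 'abc123'}; for a URL without ?key= it is just {}

    # The same key is re-attached to each episode's site_url so that SpreakerIE
    # can recover it again through parse_qs:
    print(update_url_query('https://www.spreaker.com/episode/987654', additional_api_query))
    # -> https://www.spreaker.com/episode/987654?key=abc123
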
From 7413f79ccb7dbd7bf735f4d58c3cd41d0546e97e Mon Sep 17 00:00:00 2001
From: gandbg <62039938+gandbg@users.noreply.github.com>
Date: Thu, 31 Jul 2025 21:01:01 +0200
Subject: [PATCH 3/3] [ie/SpreakerShow] Add support for login using OAuth
 token

---
 yt_dlp/extractor/spreaker.py | 48 ++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/yt_dlp/extractor/spreaker.py b/yt_dlp/extractor/spreaker.py
index ffa4c58a95..5dbca03280 100644
--- a/yt_dlp/extractor/spreaker.py
+++ b/yt_dlp/extractor/spreaker.py
@@ -2,6 +2,7 @@ import itertools
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     float_or_none,
     int_or_none,
     parse_qs,
@@ -116,12 +117,29 @@ class SpreakerIE(InfoExtractor):
         'url': 'https://www.spreaker.com/episode/60269615',
         'only_matching': True,
     }]
+    _NETRC_MACHINE = 'spreaker'
+    additional_api_query = {}
+
+    def _perform_login(self, username, password):
+        if username != 'oauth':
+            raise ExtractorError('Login using username and password is not currently supported. '
+                                 'Set your username to "oauth" and the password to an OAuth token to log in that way. '
+                                 f'{self._login_hint("password")}', expected=True)
+
+        self._download_json(f'https://api.spreaker.com/v2/me?oauth2_access_token={password}', None,
+                            note='Verifying OAuth token', errnote='Provided OAuth token is not valid')
+
+        # _download_json raises an ExtractorError for an invalid token,
+        # so reaching this point means the token was accepted
+        self.additional_api_query.update({'oauth2_access_token': password})
 
     def _real_extract(self, url):
         episode_id = self._match_id(url)
+        self.additional_api_query.update(traverse_obj(parse_qs(url), {'key': ('key', 0)}) or {})
+
         data = self._download_json(
             f'https://api.spreaker.com/v2/episodes/{episode_id}', episode_id,
-            query=traverse_obj(parse_qs(url), {'key': ('key', 0)}))['response']['episode']
+            query=self.additional_api_query)['response']['episode']
         return _extract_episode(data, episode_id)
 
 
@@ -174,21 +192,35 @@ class SpreakerShowIE(InfoExtractor):
         },
         'playlist_mincount': 290,
     }]
+    _NETRC_MACHINE = 'spreaker'
+    additional_api_query = {}
+
+    def _perform_login(self, username, password):
+        if username != 'oauth':
+            raise ExtractorError('Login using username and password is not currently supported. '
+                                 'Set your username to "oauth" and the password to an OAuth token to log in that way. '
+                                 f'{self._login_hint("password")}', expected=True)
+
+        self._download_json(f'https://api.spreaker.com/v2/me?oauth2_access_token={password}', None,
+                            note='Verifying OAuth token', errnote='Provided OAuth token is not valid')
+
+        # _download_json raises an ExtractorError for an invalid token,
+        # so reaching this point means the token was accepted
+        self.additional_api_query.update({'oauth2_access_token': password})
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
-        additional_api_query = traverse_obj(parse_qs(url), {
-            'key': ('key', 0),
-        }) or {}
+        self.additional_api_query.update(traverse_obj(parse_qs(url), {'key': ('key', 0)}) or {})
+
         show_data = self._download_json(
             f'https://api.spreaker.com/v2/shows/{show_id}', show_id,
-            note='Downloading JSON show metadata', query=additional_api_query)
+            note='Downloading JSON show metadata', query=self.additional_api_query)
         episodes = []
         episodes_api_url = f'https://api.spreaker.com/v2/shows/{show_id}/episodes?limit=100'
 
         for page_num in itertools.count(1):
             episodes_api = self._download_json(episodes_api_url, show_id,
-                note=f'Downloading JSON episodes metadata page {page_num}', query=additional_api_query)
+                note=f'Downloading JSON episodes metadata page {page_num}', query=self.additional_api_query)
             episodes_in_page = traverse_obj(episodes_api, ('response', 'items', ..., {
                 'url': 'site_url',
                 'id': 'episode_id',
@@ -196,7 +228,9 @@ class SpreakerShowIE(InfoExtractor):
             }))
 
             for i in episodes_in_page:
-                episodes.append(self.url_result(update_url_query(i['url'], additional_api_query), ie=SpreakerIE.ie_key(), video_id=i.get('id'), video_title=i.get('title')))
+                episodes.append(self.url_result(
+                    update_url_query(i['url'], traverse_obj(self.additional_api_query, {'key': 'key'})),
+                    ie=SpreakerIE.ie_key(), video_id=i.get('id'), video_title=i.get('title')))
 
             episodes_api_url = traverse_obj(episodes_api, ('response', 'next_url'), default=None)
             if episodes_api_url is None:
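
With patch 3 applied, authentication is expected to go through yt-dlp's normal credential options: --username oauth --password <token> on the command line, or a .netrc entry for machine 'spreaker' with login 'oauth' (matching _NETRC_MACHINE above). A rough sketch of the same thing through the Python API follows; the token value is a placeholder, and the show URL is one of the test URLs from patch 1:

    import yt_dlp

    ydl_opts = {
        'username': 'oauth',                      # anything else makes _perform_login raise
        'password': 'YOUR_SPREAKER_OAUTH_TOKEN',  # placeholder; a real token is verified against /v2/me
        'extract_flat': 'in_playlist',            # list the show without extracting every episode
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(
            'https://www.spreaker.com/podcast/health-wealth--5918323', download=False)
        print(info.get('title'), '-', info.get('uploader'))
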