feat(extractor/applepodcasts): add playlist support

pull/13864/head
Florent DELAHAYE 3 weeks ago
parent 23c658b9cb
commit ebea243ea2

@ -137,7 +137,10 @@ from .aol import AolIE
from .apa import APAIE from .apa import APAIE
from .aparat import AparatIE from .aparat import AparatIE
from .appleconnect import AppleConnectIE from .appleconnect import AppleConnectIE
from .applepodcasts import ApplePodcastsIE from .applepodcasts import (
ApplePodcastsIE,
ApplePodcastsPlaylistIE
)
from .appletrailers import ( from .appletrailers import (
AppleTrailersIE, AppleTrailersIE,
AppleTrailersSectionIE, AppleTrailersSectionIE,

@ -5,10 +5,16 @@ from ..utils import (
parse_iso8601, parse_iso8601,
) )
from ..utils.traversal import traverse_obj from ..utils.traversal import traverse_obj
import re
from urllib.parse import urljoin
class ApplePodcastsBaseIE(InfoExtractor):
_BASE_URL_REGEX = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}'
_BASE_HTML_JSON_LOCATION = r'<script [^>]*\bid=["\']serialized-server-data["\'][^>]*>'
_BASE_HTML_JSON_PATTERN = r'\[{(?s:.+)}\]'
class ApplePodcastsIE(InfoExtractor): class ApplePodcastsIE(ApplePodcastsBaseIE):
_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _VALID_URL = ApplePodcastsBaseIE._BASE_URL_REGEX + r'.*?\bi=(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654', 'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654',
'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172', 'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172',
@ -55,8 +61,8 @@ class ApplePodcastsIE(InfoExtractor):
episode_id = self._match_id(url) episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id) webpage = self._download_webpage(url, episode_id)
server_data = self._search_json( server_data = self._search_json(
r'<script [^>]*\bid=["\']serialized-server-data["\'][^>]*>', webpage, ApplePodcastsBaseIE._BASE_HTML_JSON_LOCATION, webpage,
'server data', episode_id, contains_pattern=r'\[{(?s:.+)}\]')[0]['data'] 'server data', episode_id, contains_pattern=ApplePodcastsBaseIE._BASE_HTML_JSON_PATTERN)[0]['data']
model_data = traverse_obj(server_data, ( model_data = traverse_obj(server_data, (
'headerButtonItems', lambda _, v: v['$kind'] == 'bookmark' and v['modelType'] == 'EpisodeOffer', 'headerButtonItems', lambda _, v: v['$kind'] == 'bookmark' and v['modelType'] == 'EpisodeOffer',
'model', {dict}, any)) 'model', {dict}, any))
@ -75,3 +81,131 @@ class ApplePodcastsIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'vcodec': 'none', 'vcodec': 'none',
} }
class ApplePodcastsPlaylistIE(ApplePodcastsBaseIE):
# Apple podcast items are partially described (last episodes only) in the embedded json from main page therefore API calls are mandatory to get a full list
_VALID_URL = ApplePodcastsBaseIE._BASE_URL_REGEX + r'/id(?P<id>\d+)'
_TESTS = [{
'url': 'https://podcasts.apple.com/fr/podcast/id1691740320',
'info_dict': {
'id': '1691740320',
'title': 'LEGEND',
'playlist_uploader': 'Guillaume Pley'
},
'playlist_mincount': 400,
'playlist_entries': [
{
'id': '1000718966711',
'title': 'MICHAEL YOUN : LES MOMENTS LES PLUS FOUS DE SES 25 ANS DE CARRIÈRE (MORNING LIVE, FATAL 2…)',
'uploader': 'Guillaume Pley',
'description': "Retrouvez la boutique LEGEND ➡️: https://shop.legend-group.fr/\nMerci à Michaël Youn d'être venu nous voir sur LEGEND. Il est venu nous raconter plus de 25 ans de carrière, en tant quacteur, réalisateur et artiste. Il a été révélé par l'émission Morning Live sur M6, il nous a livré les anecdotes les plus folles quil a vécues à cette époque. Il est aussi venu nous raconter comment il a rencontré sa femme et ce que ses enfants ont changé dans sa vie.\nPour voir la bande annonce du film « Certains laiment chauve » déjà disponible au cinéma ➡️ https://www.allocine.fr/film/fichefilm_gen_cfilm=1000007354.html\nRetrouvez l'interview complète sur YouTube ➡ https://youtu.be/_TXBz1dSfBw\nPour toutes demandes de partenariats : legend@influxcrew.com\nRetrouvez-nous sur tous les réseaux LEGEND !\nFacebook : https://www.facebook.com/legendmediafr\nInstagram : https://www.instagram.com/legendmedia/\nTikTok : https://www.tiktok.com/@legend\nTwitter : https://twitter.com/legendmediafr\nSnapchat : https://t.snapchat.com/CgEvsbWV\n Hébergé par Acast. Visitez acast.com/privacy pour plus d'informations.",
'release_timestamp': 1753434168,
'duration': 6856,
'url': 'https://podcasts.apple.com/fr/podcast/michael-youn-les-moments-les-plus-fous-de-ses-25-ans/id1691740320?i=1000718966711'
},
{
'id': '1000718672235',
'title': 'AMBULANCIER DU SAMU: SES INTERVENTIONS IMPROBABLES (SUIC*DES, FAUX MALADES, ENFANTS DR0GUÉS)',
'uploader': 'Guillaume Pley',
'description': "Retrouvez la boutique LEGEND ➡️: https://shop.legend-group.fr/\nMerci à Thomas dêtre passé nous voir chez LEGEND ! Thomas est ambulancier et urgentiste au SMUR depuis 10 ans. Il est venu partager avec nous ses anecdotes les plus marquantes.\nIl a vécu des interventions difficiles, comme sur une scène de crime où une mère avait tué ses deux enfants, ou encore ce jour où il a pris en charge une victime coupée en deux par un hachoir.\nMais son métier, cest aussi des moments plus légers, parfois même drôles, comme cette fois où il a dû intervenir sur le tournage dun film X pour secourir des acteurs.\nPour toutes demandes de partenariats : legend@influxcrew.com\nRetrouvez-nous sur tous les réseaux LEGEND !\nRetrouvez l'interview complète sur YouTube ➡ https://youtu.be/ye5cVoc7hIc\nFacebook : https://www.facebook.com/legendmediafr\nInstagram : https://www.instagram.com/legendmedia/\nTikTok : https://www.tiktok.com/@legend\nTwitter : https://twitter.com/legendmediafr\nSnapchat : https://t.snapchat.com/CgEvsbWV\n Hébergé par Acast. Visitez acast.com/privacy pour plus d'informations.",
'release_timestamp': 1753272000,
'duration': 4165,
'url': 'https://podcasts.apple.com/fr/podcast/ambulancier-du-samu-ses-interventions-improbables-suic/id1691740320?i=1000718672235'
}
]
}]
# Extract token (supposedly JWT) from javascript
# Note: javascript file number/names and token variable name may change
def _extract_token(self, webpage):
js_urls = re.findall(r'<script[^>]+src=["\'](/assets/[^"\']+\.js)["\']', webpage)
js_urls = [urljoin('https://podcasts.apple.com', u) for u in js_urls]
auth_token = None
for js_url in js_urls:
js_code = self._download_webpage(js_url, 'Generic authentification token', fatal=False, note=f'Scanning {js_url}')
if not js_code:
continue
match = re.search(r'const\s+Ml\s*=\s*"((?:eyJ)[^"]+)"', js_code)
if match:
auth_token = match.group(1)
break
if not auth_token:
raise ExtractorError('Generic authorization token not found in any JS files')
return auth_token
# Call backend API pages and merge them as a single list
def _paginate_episodes(self, playlist_id, token):
base_url = 'https://amp-api.podcasts.apple.com/v1/catalog/fr/podcasts/'
headers = {
'Authorization': f'Bearer {token}',
'Origin': 'https://podcasts.apple.com'
}
all_episodes = []
offset = 0
limit = 25 # Limit in use by website but other values seem to be accepted
while True:
episodes_url = (
f'{base_url}{playlist_id}/episodes?'
f'l=fr-FR&offset={offset}&limit={limit}'
)
episodes_json = self._download_json(episodes_url, playlist_id, headers=headers, note=f'Downloading episodes offset {offset}')
all_episodes.extend(episodes_json.get('data', []))
if 'next' not in episodes_json:
break
offset += limit
return all_episodes
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
server_data = self._search_json(
ApplePodcastsBaseIE._BASE_HTML_JSON_LOCATION, webpage,
'server data', playlist_id, contains_pattern=ApplePodcastsBaseIE._BASE_HTML_JSON_PATTERN)[0]['data']
playlist_data = traverse_obj(server_data,
(..., lambda _, v: v.get('contentType') == 'showHeaderRegular', 'items', 0),
expected_type=dict, get_all=False)
token = self._extract_token(webpage)
episodes = self._paginate_episodes(playlist_id, token)
entries = []
for e in episodes:
episode_data = traverse_obj(e, {
'id': ('id', {str}),
'title': ('attributes', 'name', {str}),
'uploader': ('attributes', 'artistName', {str}),
'description': ('attributes', 'description', 'standard', {str}),
'url': ('attributes', 'url', {clean_podcast_url}),
'release_timestamp': ('attributes', 'releaseDateTime', {parse_iso8601}),
'duration': ('attributes', 'durationInMilliseconds', {lambda x: int(x) // 1000}),
'thumbnail_template': ('artwork', 'url', {str}),
'thumb_width': ('artwork', 'width', {int}),
'thumb_height': ('artwork', 'height', {int}),
})
if not episode_data.get('url'):
continue
entries.append({
'_type': 'url',
'ie_key': 'ApplePodcasts',
**episode_data,
})
return self.playlist_result(entries,
playlist_id,
**traverse_obj(playlist_data, {
'playlist_title': ('title', {str}),
'playlist_description': ('description', {str}),
'playlist_uploader': ('providerTitle', {str}),
})
)

Loading…
Cancel
Save