import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    join_nonempty,
    js_to_json,
    parse_duration,
    strftime_or_none,
    traverse_obj,
    unified_strdate,
    urljoin,
)


class RadioFranceIE(InfoExtractor):
    """Extractor for maison.radiofrance.fr "radiovisions" pages."""

    _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    def _real_extract(self, url):
        """Scrape title, description, uploader and audio formats from the page."""
        m = self._match_valid_url(url)
        video_id = m.group('id')

        webpage = self._download_webpage(url, video_id)
        # NOTE(review): the HTML tags inside the next three patterns were
        # missing from the file (markup had been stripped) and have been
        # reconstructed; verify them against the live page markup.
        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            webpage, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            webpage, 'uploader', fatal=False)

        # data-source holds "<format id>: '<url>'" pairs; the position of a
        # pair in that list is used as its quality rank.
        formats_str = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            webpage, 'audio URLs')
        formats = [
            {
                'format_id': fm[0],
                'url': fm[1],
                'vcodec': 'none',
                'quality': i,
            }
            for i, fm in
            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
        ]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }


class RadioFranceBaseIE(InfoExtractor):
    """Shared URL fragments and helpers for the www.radiofrance.fr extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Alternation of the station slugs accepted in URLs.
    _STATIONS_RE = '|'.join(map(re.escape, (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    )))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        """Return the first dict found under data/<key> in the page's `const data = [...]` JS array, or {}."""
        return traverse_obj(self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json),
            (..., 'data', key, {dict}), get_all=False) or {}


class FranceCultureIE(RadioFranceBaseIE):
    """Extractor for single podcast episodes (URLs ending in a numeric id of 6+ digits)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
            'info_dict': {
                'id': '2107675',
                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
                'description': 'md5:36ee74351ede77a314fdebb94026b916',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20230310',
                'duration': 8977,
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        },
        {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict from the page's embedded "AudioObject" JSON plus HTML metadata."""
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_data['contentUrl'],
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            'title': self._html_search_regex(
                r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                webpage, 'title', default=self._og_search_title(webpage)),
            # NOTE(review): the description pattern, the thumbnail entry and
            # the opening tag of the uploader pattern were lost when markup
            # was stripped from the file; they are reconstructed here (the
            # tests above do expect a thumbnail) - verify against the page.
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)),
        }


class RadioFranceLiveIE(RadioFranceBaseIE):
    """Extractor for station live streams and their "radio-*" sub-stations."""

    _VALID_URL = rf'''(?x)
        https?://(?:www\.)?radiofrance\.fr
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/',
        'info_dict': {
            'id': 'franceinter',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/franceculture',
        'info_dict': {
            'id': 'franceculture',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
        'info_dict': {
            'id': 'mouv-radio-musique-kids-family',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
        'info_dict': {
            'id': 'mouv-radio-rnb-soul',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
        'info_dict': {
            'id': 'mouv-radio-musique-mix',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/fip/radio-rock',
        'info_dict': {
            'id': 'fip-radio-rock',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the live stream formats for a station or one of its sub-stations."""
        station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')

        if substation_id:
            # Sub-stations: the live data is read from the page itself
            # (the JSON following "webradioLive:").
            webpage = self._download_webpage(url, station_id)
            api_response = self._search_json(
                r'webradioLive:\s*', webpage, station_id, substation_id,
                transform_source=js_to_json)
        else:
            # Main stations: the live data comes from the station's API endpoint.
            api_response = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)

        formats, subtitles = [], {}
        # Sources are looked up both under 'now' and at the top level of the
        # response; entries without a 'url' are skipped.
        for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
            if media_source.get('format') == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': media_source['url'],
                    'abr': media_source.get('bitrate'),
                })

        return {
            'id': join_nonempty(station_id, substation_id),
            'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
                ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }


class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
    """Subclasses must set _METADATA_KEY"""

    def _call_api(self, station, content_id, cursor):
        """Fetch one page of playlist data; must be overridden by subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, station, content_id, content_response):
        """Yield the entries of every page, following the 'next' cursor until it is falsy."""
        while True:
            for entry in content_response['items']:
                if entry['link'] == '':
                    # No page to link to: hand the item over as it is.
                    yield entry
                else:
                    yield self.url_result(
                        f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
                            'title': 'title',
                            'description': 'standFirst',
                            'timestamp': ('publishedDate', {int_or_none}),
                            'thumbnail': ('visual', 'src'),
                        }))

            if content_response['next']:
                content_response = self._call_api(station, content_id, content_response['next'])
            else:
                break

    def _extract_embedded_episodes(self, item, webpage, content_id):
        """Certain episodes data are embedded directly in the page, use these if the link is missing"""
        links = item['playerInfo']['media']['sources']
        item['formats'] = []
        duration = None
        for linkkey in links:
            # Each source key is used as a prefix for `<key>.url="..."`,
            # `<key>.duration=<int>` and `<key>.preset={...}` in the page.
            url = self._search_regex(linkkey + r'\.url="([^"]+)";', webpage, content_id)
            duration = int(self._search_regex(linkkey + r'\.duration=(\d+);', webpage, content_id))
            preset = self._search_json(
                linkkey + r'\.preset=', webpage, content_id, content_id,
                contains_pattern=r'\{.+\}', transform_source=js_to_json)
            item['formats'].append({
                'format_id': preset['id'],
                'url': url,
                'vcodec': 'none',
                'acodec': preset['encoding'],
                'quality': preset['bitrate'],
                'duration': duration,
            })
        # Only set the item duration when at least one source was processed;
        # with no sources there is no duration to report.
        if duration is not None:
            item['duration'] = duration
        return item

    def _real_extract(self, url):
        """Return a playlist result built from the first API page and its metadata."""
        playlist_id = self._match_id(url)
        # If it is a podcast playlist, get the name of the station it is on
        # profile page playlists are not attached to a station currently
        station = self._match_valid_url(url).group('station') if isinstance(self, RadioFrancePodcastIE) else None

        # Get data for the first page, and the uuid for the playlist
        metadata = self._call_api(station, playlist_id, 1)
        uuid = traverse_obj(metadata, ('metadata', 'id'))

        # The second mapping (name/role) takes precedence over the first
        # (title/standFirst) for any key both of them produce.
        return self.playlist_result(
            self._generate_playlist_entries(station, playlist_id, metadata), uuid,
            display_id=playlist_id, **{**traverse_obj(metadata['metadata'], {
                'title': 'title',
                'description': 'standFirst',
                'thumbnail': ('visual', 'src'),
            }), **traverse_obj(metadata['metadata'], {
                'title': 'name',
                'description': 'role',
            })})


class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
        'info_dict': {
            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
            'display_id': 'le-billet-vert',
            'title': 'Le billet sciences',
            'description': 'md5:85d5ce8c488192e71904c551d595f4da',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/avec-la-langue',
        'info_dict': {
            'id': '53a95989-7c61-48c7-873c-6a71009101bb',
            'display_id': 'avec-la-langue',
            'title': 'Avec la langue',
            'description': 'md5:4ddb6d4ed46dbbdee611b8e16e4af868',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 36,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
        'info_dict': {
            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
            'display_id': 'serie-thomas-grjebine',
            'title': 'Thomas Grjebine',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
        'info_dict': {
            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
            'display_id': 'certains-l-aiment-fip',
            'title': 'Certains l’aiment Fip',
            'description': 'md5:7c373cdcec7a024f12fa34de7612e44e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'http://www.radiofrance.fr/franceculture/podcasts/serie-les-aventures-de-tintin-les-cigares-du-pharaon',
        'info_dict': {
            'id': '01b096c6-e7f8-49c4-8319-dd399221885b',
            'display_id': 'serie-les-aventures-de-tintin-les-cigares-du-pharaon',
            'title': 'Les Cigares du Pharaon\xa0: les Aventures de Tintin',
            'description': 'md5:1c5b6d010b2aaeb0d90b2c233b5f7b15',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_count': 5,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
        'only_matching': True,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
        'only_matching': True,
    }]

    _METADATA_KEY = 'expressions'

    def _call_api(self, station, podcast_id, cursor):
        """Podcast implementation of the RadioFrancePlaylistBaseIE._call_api hook."""
        # The data is stored in the last