From 728334a40e7a5e74fc050156288a04f26d88c1a6 Mon Sep 17 00:00:00 2001
From: doe1080 <98906116+doe1080@users.noreply.github.com>
Date: Mon, 24 Feb 2025 18:28:57 +0900
Subject: [PATCH] [ie/Voicy] Rework extractor

---
Reviewer notes (this section is not part of the commit message):
- Line structure was restored from a whitespace-collapsed copy of this
  patch; blank context lines were placed so that the line counts match
  the hunk headers and the diffstat below.
- `<...>` spans that had been stripped were restored: the named groups in
  every `_VALID_URL` and `<refreshToken>` in `_LOGIN_HINT`. The group name
  `type` in VoicyChannelIE is an assumption (the code only reads it via
  `.groups()`) - please confirm.
- VoicyIE: the description lookup now goes through `traverse_obj`, so a
  failed non-fatal request no longer has `.get()` called on its result.
  The post-image blob hash on the `index` line predates this change.

 yt_dlp/extractor/_extractors.py |   2 +
 yt_dlp/extractor/voicy.py       | 462 ++++++++++++++++++++++++--------
 2 files changed, 348 insertions(+), 116 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 403e1f1f65..9dafc16a08 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2389,6 +2389,8 @@ from .vodplatform import VODPlatformIE
 from .voicy import (
     VoicyChannelIE,
     VoicyIE,
+    VoicyLiveIE,
+    VoicyTopicIE,
 )
 from .volejtv import VolejTVIE
 from .voxmedia import (
diff --git a/yt_dlp/extractor/voicy.py b/yt_dlp/extractor/voicy.py
index f83c3f9411..29cbd8490f 100644
--- a/yt_dlp/extractor/voicy.py
+++ b/yt_dlp/extractor/voicy.py
@@ -1,145 +1,375 @@
+import calendar
 import itertools
+import time
+import urllib.parse
 
-from .common import InfoExtractor
+from .wrestleuniverse import WrestleUniverseBaseIE
 from ..utils import (
     ExtractorError,
-    smuggle_url,
+    float_or_none,
+    merge_dicts,
+    parse_iso8601,
+    parse_qs,
     str_or_none,
-    traverse_obj,
-    unified_strdate,
-    unsmuggle_url,
+    update_url,
+    url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
-class VoicyBaseIE(InfoExtractor):
-    def _extract_from_playlist_data(self, value):
-        voice_id = str(value.get('PlaylistId'))
-        upload_date = unified_strdate(value.get('Published'), False)
-        items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']]
-        return {
-            '_type': 'multi_video',
-            'entries': items,
-            'id': voice_id,
-            'title': str(value.get('PlaylistName')),
-            'uploader': value.get('SpeakerName'),
-            'uploader_id': str_or_none(value.get('SpeakerId')),
-            'channel': value.get('ChannelName'),
-            'channel_id': str_or_none(value.get('ChannelId')),
-            'upload_date': upload_date,
-        }
+class VoicyBaseIE(WrestleUniverseBaseIE):
+    _LOGIN_HEADERS = {
+        'Content-Type': 'application/json',
+        'X-Client-Version': 'Chrome/JsCore/10.13.2/FirebaseCore-web',
+        'X-Firebase-Gmpid': '1:212371279501:web:318567ddcbb953adcc5cc4',
+    }
+    _LOGIN_HINT = (
+        'Use --username refresh --password <refreshToken>, --username and --password, '
+        '--netrc-cmd, or --netrc (voicy) to provide account credentials')
+    _LOGIN_QUERY = {'key': 'AIzaSyC5Rg-sxiYu6ySD8V-f6Eljwll8gHvgUK4'}
+    _NETRC_MACHINE = 'voicy'
 
-    def _extract_single_article(self, entry):
-        formats = [{
-            'url': entry['VoiceHlsFile'],
-            'format_id': 'hls',
-            'ext': 'm4a',
-            'acodec': 'aac',
-            'vcodec': 'none',
-            'protocol': 'm3u8_native',
-        }, {
-            'url': entry['VoiceFile'],
-            'format_id': 'mp3',
-            'ext': 'mp3',
-            'acodec': 'mp3',
-            'vcodec': 'none',
-        }]
-        return {
-            'id': str(entry.get('ArticleId')),
-            'title': entry.get('ArticleTitle'),
-            'description': entry.get('MediaName'),
-            'formats': formats,
-        }
+    @WrestleUniverseBaseIE._TOKEN.getter
+    def _TOKEN(self):
+        if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()):
+            if not self._REFRESH_TOKEN:
+                self.raise_login_required(
+                    f'No refreshToken provided. {self._LOGIN_HINT}', method=None)
+            self._refresh_token()
+        return self._REAL_TOKEN
+
+    def _perform_login(self, username, password):
+        if username.lower() == 'refresh':
+            self._REFRESH_TOKEN = password
+            return self._refresh_token()
+        return super()._perform_login(username, password)
 
-    def _call_api(self, url, video_id, **kwargs):
-        response = self._download_json(url, video_id, **kwargs)
-        if response.get('Status') != 0:
-            message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=str)
-            if not message:
-                message = 'There was a error in the response: %d' % response.get('Status')
-            raise ExtractorError(message, expected=False)
-        return response.get('Value')
+    def _call_api(self, path, some_id, note='Downloading JSON metadata', headers=None, query=None, fatal=True):
+        return self._download_json(
+            f'https://vmedia-player-api.voicy.jp/v1/{path}', some_id, note=note, headers={
+                'Authorization': f'Bearer {self._TOKEN}',
+            } | (headers or {}), query=query, fatal=fatal,
+        )
 
 
 class VoicyIE(VoicyBaseIE):
-    _WORKING = False
     IE_NAME = 'voicy'
-    _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
-    ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
+    IE_DESC = 'Voicy'
+
+    _VALID_URL = [
+        r'https?://(?:www\.)?voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)',
+        r'http://r\.voicy\.jp/\w+',
+    ]
     _TESTS = [{
-        'url': 'https://voicy.jp/channel/1253/122754',
+        'url': 'https://voicy.jp/channel/3402/6361249',
         'info_dict': {
-            'id': '122754',
-            'title': '1/21(木)声日記:ついに原稿終わった!!',
-            'uploader': 'ちょまど@ ITエンジニアなオタク',
-            'uploader_id': '7339',
+            'id': '8576738',
+            'ext': 'm4a',
+            'title': '2025.1.19「ブラジル沖の白石康次郎さん&息子の成人式を『1年』間違えたお母さん」',
+            'categories': ['トーク', '声優・アナウンサー'],
+            'channel': '安住紳一郎の日曜天国',
+            'channel_id': '3402',
+            'comment_count': int,
+            'description': 'md5:f39bb238ff7661c3b7e8934f8578cf33',
+            'display_id': '6361249',
+            'duration': 1588.741,
+            'like_count': int,
+            'release_date': '20250119',
+            'release_timestamp': 1737272164,
+            'series': '2025.1.19「ブラジル沖の白石康次郎さん&息子の成人式を『1年』間違えたお母さん」',
+            'series_id': '6361249',
+            'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
+            'uploader': 'TBS RADIO',
+            'uploader_id': '17328',
+            'view_count': int,
         },
-        'playlist_mincount': 9,
+    }, {
+        'url': 'https://voicy.jp/channel/3272/1141448',
+        'info_dict': {
+            'id': '2757390',
+            'ext': 'mp3',
+            'title': '5/3 お久しぶり雑談回',
+            'categories': ['トーク', '声優・アナウンサー'],
+            'channel': '松嵜麗のボイログ!',
+            'channel_id': '3272',
+            'comment_count': int,
+            'description': 'md5:4dee911d23cf1eedeb49687881878119',
+            'display_id': '1141448',
+            'duration': 433.24,
+            'like_count': int,
+            'release_date': '20240502',
+            'release_timestamp': 1714662728,
+            'series': '最近のわたし',
+            'series_id': '1141448',
+            'tags': ['最近のマイブーム'],
+            'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
+            'uploader': '声優・松嵜麗',
+            'uploader_id': '16462',
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://voicy.jp/channel/1417/6436213',
+        'info_dict': {
+            'id': '6436213',
+            'title': '第100回 グリム兄弟「麦のほ」',
+        },
+        'playlist_count': 5,
+        'skip': 'Only available for premium supporters',
+    }, {
+        'url': 'http://r.voicy.jp/7Qm2JbexmY6',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        assert mobj
-        voice_id = mobj.group('id')
-        channel_id = mobj.group('channel_id')
-        url, article_list = unsmuggle_url(url)
-        if not article_list:
-            article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
-        return self._extract_from_playlist_data(article_list)
-
-
-class VoicyChannelIE(VoicyBaseIE):
-    _WORKING = False
+        if not url.startswith('https'):
+            return self.url_result(update_url(url, scheme='https'))
+
+        channel_id, audio_id = self._match_valid_url(url).groups()
+        channel_info = {
+            'channel_id': channel_id,
+            **traverse_obj(self._call_api(f'channel/{channel_id}', channel_id), {
+                'categories': ('category', ('name', ('subcategory', 'name'), {str})),
+                'channel': ('name', {str}),
+                'thumbnail': ('image', {url_or_none}),
+                'uploader': ('personality', 'name', {str}),
+                'uploader_id': ('personality', 'id', {str_or_none}),
+            }),
+        }
+
+        audio_info = self._call_api(f'channels/{channel_id}/stories/{audio_id}', audio_id)
+        common_info = {
+            'description': traverse_obj(self._call_api(
+                f'channels/{channel_id}/stories/{audio_id}/description', audio_id, fatal=False,
+            ), ('description', {str})),
+            'display_id': audio_id,
+            'series': audio_info['name'],
+            'series_id': audio_id,
+            **traverse_obj(audio_info, {
+                'comment_count': ('comment_count', {int}),
+                'like_count': ('like_count', {int}),
+                'release_timestamp': ('published', {parse_iso8601}),
+                'tags': ('hashtags', ..., 'name', {str}),
+            }),
+        }
+
+        all_entries = traverse_obj(audio_info, ('chapters', ..., {
+            'id': ('id', {str_or_none}),
+            'title': ('name', {str}),
+            'duration': ('voice', 'duration', {float_or_none(scale=1000)}),
+            'manifest': ('voice', 'file', {url_or_none}),
+            'view_count': ('play_count', {int}),
+        }))
+
+        entries = []
+        for entry in all_entries:
+            if manifest := entry.pop('manifest', None):
+                ext = 'm4a' if 'audio_hls_aac' in manifest else 'mp3'
+                merged = merge_dicts(entry, channel_info, common_info, {
+                    'formats': self._extract_m3u8_formats(manifest, audio_id, ext),
+                })
+                if len(all_entries) == 1:
+                    return merged
+                entries.append(merged)
+        if not entries:
+            self.raise_login_required(
+                f'Premium(VIP) authentication required. {self._LOGIN_HINT}', method=None)
+
+        return self.playlist_result(entries, audio_id, audio_info['name'])
+
+
+class VoicyLiveIE(VoicyBaseIE):
+    IE_NAME = 'voicy:live'
+
+    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/\d+/live/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://voicy.jp/channel/1417/live/4858078',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        live_info = self._call_api(
+            f'live/{audio_id}', audio_id, headers={'X-Platform': '3'})
+        if live_info['status'] != 'ended':
+            raise ExtractorError('WebRTC is not currently supported', expected=True)
+        if share_url := traverse_obj(live_info, ('archive', 'share_url', {url_or_none})):
+            return self.url_result(share_url, VoicyIE)
+        self.raise_no_formats(
+            'This livestream has ended and no archive is available', expected=True)
+
+
+class VoicyPlaylistBaseIE(VoicyBaseIE):
+    def _entries(self, path, some_id, query, keys, ie=VoicyIE):
+        pagination = ''
+
+        for page in itertools.count(1):
+            info = self._call_api(
+                path, some_id, f'Downloading page {page}',
+                query={
+                    'page_size': '100',
+                    'page_token': pagination,
+                } | (query or {}),
+            )
+            yield from (self.url_result(s, ie) for s in traverse_obj(info, (*keys, 'share_url', {url_or_none})))
+
+            if not (pagination := traverse_obj(info, ('pagination', 'next_page_token', {str}))):
+                break
+            self._sleep(1, some_id)
+
+
+class VoicyChannelIE(VoicyPlaylistBaseIE):
     IE_NAME = 'voicy:channel'
-    _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
-    PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
+
+    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/(?P<id>\d+)(?:/(?P<type>all|backnumber/\d+|premium))?(?:\?|$)'
     _TESTS = [{
-        'url': 'https://voicy.jp/channel/1253/',
+        'url': 'https://voicy.jp/channel/3402',
+        'info_dict': {
+            'id': '3402',
+            'title': '安住紳一郎の日曜天国',
+        },
+        'playlist_mincount': 107,
+    }, {
+        'url': 'https://voicy.jp/channel/1/premium',
+        'info_dict': {
+            'id': '1',
+            'title': 'Voicy社長の頭の中',
+        },
+        'playlist_mincount': 145,
+    }, {
+        'url': 'https://voicy.jp/channel/2856/all',
+        'info_dict': {
+            'id': '2856',
+            'title': 'そんなこんなで、茅原実里です',
+        },
+        'playlist_mincount': 62,
+    }, {
+        'url': 'https://voicy.jp/channel/3321/all?type=all&month=202412',
         'info_dict': {
-            'id': '7339',
-            'title': 'ゆるふわ日常ラジオ #ちょまラジ',
-            'uploader': 'ちょまど@ ITエンジニアなオタク',
-            'uploader_id': '7339',
+            'id': '3321',
+            'title': '海外安全チャンネル・りょーあん',
         },
-        'playlist_mincount': 54,
+        'playlist_count': 4,
+    }, {
+        'url': 'https://voicy.jp/channel/1417/backnumber/202501',
+        'info_dict': {
+            'id': '1417',
+            'title': '繪ほんの中には 公式チャンネル',
+        },
+        'playlist_count': 5,
     }]
 
-    @classmethod
-    def suitable(cls, url):
-        return not VoicyIE.suitable(url) and super().suitable(url)
+    def _real_extract(self, url):
+        channel_id, _type = self._match_valid_url(url).groups()
+        channel_info = self._call_api(f'channel/{channel_id}', channel_id)
+        query = {
+            'filter_type' if k == 'type' else k: v[0]
+            for k, v in parse_qs(url).items() if v
+        } | {'channel_view_id': channel_id, 'order': 'new'}
 
-    def _entries(self, channel_id):
-        pager = ''
-        for count in itertools.count(1):
-            article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note=f'Paging #{count}')
-            playlist_data = article_list.get('PlaylistData')
-            if not playlist_data:
-                break
-            yield from playlist_data
-            last = playlist_data[-1]
-            pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
+        if _type == 'premium':
+            query['filter_type'] = 'premium'
+        elif (ym := query.pop('month', None) or (_type not in ('all', None) and _type.split('/')[-1])):
+            y, m = map(int, (ym[:4], ym[4:]))
+            d = calendar.monthrange(y, m)[1]
+            query.update({
+                'from': f'{y}-{m:02d}-01T00:00:00+09:00',
+                'to': f'{y}-{m:02d}-{d}T23:59:59+09:00',
+            })
+
+        return self.playlist_result(self._entries(
+            'stories', channel_id, query, ('stories', ...),
+        ), channel_id, channel_info['name'])
+
+
+class VoicyTopicIE(VoicyPlaylistBaseIE):
+    IE_NAME = 'voicy:topic'
+
+    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/(?P<id>{})(?:/(?P<ctg_or_kwd>[\w%]+))?(?:/(?P<sub_ctg>[\w-]+))?'.format(
+        '|'.join(('audiobook', 'category', 'follow(?:ing-paystory)?', 'hashtag', 'paystory', 'pickup', 'search', 'voicedrama')))
+    _TESTS = [{
+        'url': 'https://voicy.jp/audiobook',
+        'info_dict': {
+            'id': '111',
+            'title': 'audiobook',
+        },
+        'playlist_mincount': 96,
+    }, {
+        'url': 'https://voicy.jp/category/talk/voiceactor-announcer',
+        'info_dict': {
+            'id': 'voiceactor-announcer',
+            'title': '声優・アナウンサー',
+        },
+        'playlist_mincount': 70,
+    }, {
+        'url': 'https://voicy.jp/category/sports/all',
+        'info_dict': {
+            'id': 'sports',
+            'title': 'スポーツ',
+        },
+        'playlist_mincount': 174,
+    }, {
+        'url': 'https://voicy.jp/hashtag/%E3%82%B9%E3%83%9E%E3%83%BC%E3%83%88%E5%AE%B6%E9%9B%BB',
+        'info_dict': {
+            'id': 'hashtag',
+            'title': 'スマート家電',
+        },
+        'playlist_mincount': 14,
+    }, {
+        'url': 'https://voicy.jp/search/%E6%81%B5%E6%96%B9%E5%B7%BB%E3%81%8D',
+        'info_dict': {
+            'id': 'search',
+            'title': '恵方巻き',
+        },
+        'playlist_mincount': 102,
+    }, {
+        'url': 'https://voicy.jp/follow',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        channel_id = self._match_id(url)
-        articles = self._entries(channel_id)
-
-        first_article = next(articles, None)
-        title = traverse_obj(first_article, ('ChannelName', ), expected_type=str)
-        speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=str)
-        if not title and speaker_name:
-            title = f'Uploads from {speaker_name}'
-        if not title:
-            title = f'Uploads from channel ID {channel_id}'
-
-        articles = itertools.chain([first_article], articles) if first_article else articles
-
-        playlist = (
-            self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key())
-            for value in articles)
-        return {
-            '_type': 'playlist',
-            'entries': playlist,
-            'id': channel_id,
-            'title': title,
-            'channel': speaker_name,
-            'channel_id': channel_id,
-        }
+        topic = self._match_id(url)
+        topic_id, is_story, ie = {
+            'audiobook': ('111', False, VoicyChannelIE),
+            'category': ('category', False, VoicyChannelIE),
+            'follow': ('1', True, VoicyIE),
+            'following-paystory': ('94', True, VoicyIE),
+            'hashtag': ('hashtag', True, VoicyIE),
+            'paystory': ('70', True, VoicyIE),
+            'pickup': ('21', False, VoicyChannelIE),
+            'search': ('search', True, VoicyIE),
+            'voicedrama': ('93', False, VoicyChannelIE),
+        }[topic]
+
+        keys = ('channels', ..., *('story',) * is_story)
+        if topic == 'category':
+            ctg, sub_ctg = self._match_valid_url(url).group('ctg_or_kwd', 'sub_ctg')
+            category = topic_id = sub_ctg if (has_sub := sub_ctg != 'all') else ctg
+            category_id, topic = traverse_obj(self._call_api('channel/categories', None), (
+                *((..., 'subcategories') if has_sub else ()),
+                lambda _, v: v['view_id'] == category, ('id', 'name'), {str_or_none},
+            ))
+            path = f'channel/categories/{category_id}'
+            query = {'exclude_story': 'true'}
+        elif topic in ('hashtag', 'search'):
+            keyword = self._match_valid_url(urllib.parse.unquote(url)).group('ctg_or_kwd')
+            if not keyword:
+                raise ExtractorError('Invalid URL', expected=True)
+            path = 'search/channels/story'
+            query = {
+                'search_type': {
+                    'hashtag': 'hashtag',
+                    'search': 'words',
+                }[topic],
+                'words': keyword,
+            }
+            topic = keyword
+        elif topic_id == '1':
+            path = 'user/me/channels/story/following'
+            query = {'series_filter': '2'}
+        else:
+            path = f'topics/channels{"/story" * is_story}'
+            query = {'topic_id': topic_id}
+            keys = (..., *keys)
+
+        return self.playlist_result(self._entries(path, topic_id, query, keys, ie), topic_id, topic)