You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yt-dlp/yt_dlp/extractor/voicy.py

376 lines
14 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import calendar
import itertools
import time
import urllib.parse
from .wrestleuniverse import WrestleUniverseBaseIE
from ..utils import (
ExtractorError,
float_or_none,
merge_dicts,
parse_iso8601,
parse_qs,
str_or_none,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class VoicyBaseIE(WrestleUniverseBaseIE):
    """Shared login/token handling and API helper for the Voicy extractors.

    Overrides login-related constants and the ``_TOKEN`` getter of
    ``WrestleUniverseBaseIE`` and adds ``_call_api`` for the Voicy player API.
    """

    _NETRC_MACHINE = 'voicy'
    _LOGIN_QUERY = {'key': 'AIzaSyC5Rg-sxiYu6ySD8V-f6Eljwll8gHvgUK4'}
    _LOGIN_HEADERS = {
        'Content-Type': 'application/json',
        'X-Client-Version': 'Chrome/JsCore/10.13.2/FirebaseCore-web',
        'X-Firebase-Gmpid': '1:212371279501:web:318567ddcbb953adcc5cc4',
    }
    # Appended to the login-required error messages raised by these extractors
    _LOGIN_HINT = (
        'Use --username refresh --password <refreshToken>, --username and --password, '
        '--netrc-cmd, or --netrc (voicy) to provide account credentials')

    @WrestleUniverseBaseIE._TOKEN.getter
    def _TOKEN(self):
        """Return the stored token, refreshing it first when it is missing or expired.

        Raises a login-required error when a refresh is needed but no
        refresh token has been set.
        """
        # Still usable: a token is stored and its expiry lies in the future
        if self._REAL_TOKEN and self._TOKEN_EXPIRY > int(time.time()):
            return self._REAL_TOKEN

        if not self._REFRESH_TOKEN:
            self.raise_login_required(
                f'No refreshToken provided. {self._LOGIN_HINT}', method=None)
        self._refresh_token()
        return self._REAL_TOKEN

    def _perform_login(self, username, password):
        """Log in, treating the username ``refresh`` as "password is a refresh token".

        Any other username is handed to the parent class' login routine.
        """
        if username.lower() != 'refresh':
            return super()._perform_login(username, password)

        self._REFRESH_TOKEN = password
        return self._refresh_token()

    def _call_api(self, path, some_id, note='Downloading JSON metadata', headers=None, query=None, fatal=True):
        """Download JSON from ``https://vmedia-player-api.voicy.jp/v1/<path>``.

        A bearer ``Authorization`` header built from ``self._TOKEN`` is always
        sent; entries in ``headers`` are merged on top of it.
        """
        auth_header = {'Authorization': f'Bearer {self._TOKEN}'}
        request_headers = auth_header | (headers or {})
        api_url = f'https://vmedia-player-api.voicy.jp/v1/{path}'
        return self._download_json(
            api_url, some_id, note=note, headers=request_headers, query=query, fatal=fatal)
class VoicyIE(VoicyBaseIE):
    """Extractor for a single Voicy story, which consists of one or more audio chapters."""

    IE_NAME = 'voicy'
    IE_DESC = 'Voicy'
    _VALID_URL = [
        r'https?://(?:www\.)?voicy\.jp/channel/(?P<channel>\d+)/(?P<id>\d+)',
        r'http://r\.voicy\.jp/\w+',
    ]
    _TESTS = [{
        'url': 'https://voicy.jp/channel/3402/6361249',
        'info_dict': {
            'id': '8576738',
            'ext': 'm4a',
            'title': '2025.1.19「ブラジル沖の白石康次郎さん息子の成人式を『1年』間違えたお母さん」',
            'categories': ['トーク', '声優・アナウンサー'],
            'channel': '安住紳一郎の日曜天国',
            'channel_id': '3402',
            'comment_count': int,
            'description': 'md5:f39bb238ff7661c3b7e8934f8578cf33',
            'display_id': '6361249',
            'duration': 1588.741,
            'like_count': int,
            'release_date': '20250119',
            'release_timestamp': 1737272164,
            'series': '2025.1.19「ブラジル沖の白石康次郎さん息子の成人式を『1年』間違えたお母さん」',
            'series_id': '6361249',
            'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
            'uploader': 'TBS RADIO',
            'uploader_id': '17328',
            'view_count': int,
        },
    }, {
        'url': 'https://voicy.jp/channel/3272/1141448',
        'info_dict': {
            'id': '2757390',
            'ext': 'mp3',
            'title': '5/3 お久しぶり雑談回',
            'categories': ['トーク', '声優・アナウンサー'],
            'channel': '松嵜麗のボイログ!',
            'channel_id': '3272',
            'comment_count': int,
            'description': 'md5:4dee911d23cf1eedeb49687881878119',
            'display_id': '1141448',
            'duration': 433.24,
            'like_count': int,
            'release_date': '20240502',
            'release_timestamp': 1714662728,
            'series': '最近のわたし',
            'series_id': '1141448',
            'tags': ['最近のマイブーム'],
            'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
            'uploader': '声優・松嵜麗',
            'uploader_id': '16462',
            'view_count': int,
        },
    }, {
        'url': 'https://voicy.jp/channel/1417/6436213',
        'info_dict': {
            'id': '6436213',
            'title': '第100回 グリム兄弟「麦のほ」',
        },
        'playlist_count': 5,
        'skip': 'Only available for premium supporters',
    }, {
        'url': 'http://r.voicy.jp/7Qm2JbexmY6',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract one story.

        Returns a single info dict when the story has exactly one chapter,
        otherwise a playlist of its playable chapters. Raises a login-required
        error when no chapter exposes a media URL.
        """
        # Non-https URLs (including the r.voicy.jp form) are re-dispatched
        # with the scheme switched to https
        if not url.startswith('https'):
            return self.url_result(update_url(url, scheme='https'))

        channel_id, audio_id = self._match_valid_url(url).groups()
        channel_info = {
            'channel_id': channel_id,
            **traverse_obj(self._call_api(f'channel/{channel_id}', channel_id), {
                'categories': ('category', ('name', ('subcategory', 'name'), {str})),
                'channel': ('name', {str}),
                'thumbnail': ('image', {url_or_none}),
                'uploader': ('personality', 'name', {str}),
                'uploader_id': ('personality', 'id', {str_or_none}),
            }),
        }

        audio_info = self._call_api(f'channels/{channel_id}/stories/{audio_id}', audio_id)

        # The description is optional (fatal=False). On a failed non-fatal
        # request yt-dlp's `_download_json` does not return a dict, so read
        # the field via traverse_obj rather than calling `.get()` on it.
        description = traverse_obj(self._call_api(
            f'channels/{channel_id}/stories/{audio_id}/description', audio_id, fatal=False,
        ), ('description', {str}))

        common_info = {
            'description': description,
            'display_id': audio_id,
            'series': audio_info['name'],
            'series_id': audio_id,
            **traverse_obj(audio_info, {
                'comment_count': ('comment_count', {int}),
                'like_count': ('like_count', {int}),
                'release_timestamp': ('published', {parse_iso8601}),
                'tags': ('hashtags', ..., 'name', {str}),
            }),
        }

        all_entries = traverse_obj(audio_info, ('chapters', ..., {
            'id': ('id', {str_or_none}),
            'title': ('name', {str}),
            'duration': ('voice', 'duration', {float_or_none(scale=1000)}),
            'manifest': ('voice', 'file', {url_or_none}),
            'view_count': ('play_count', {int}),
        }))

        entries = []
        for entry in all_entries:
            # Chapters without a media URL are skipped
            if manifest := entry.pop('manifest', None):
                ext = 'm4a' if 'audio_hls_aac' in manifest else 'mp3'
                merged = merge_dicts(entry, channel_info, common_info, {
                    'formats': self._extract_m3u8_formats(manifest, audio_id, ext),
                })
                # A single-chapter story is returned as a plain video result
                if len(all_entries) == 1:
                    return merged
                entries.append(merged)

        if not entries:
            self.raise_login_required(
                f'Premium(VIP) authentication required. {self._LOGIN_HINT}', method=None)

        return self.playlist_result(entries, audio_id, audio_info['name'])
class VoicyLiveIE(VoicyBaseIE):
    """Extractor for Voicy live pages; only ended streams with an archive are handled."""

    IE_NAME = 'voicy:live'
    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/\d+/live/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/1417/live/4858078',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve an ended livestream to the share URL of its archive.

        Raises ExtractorError for streams whose status is not ``ended`` and
        reports "no formats" when an ended stream has no archive share URL.
        """
        live_id = self._match_id(url)
        live_info = self._call_api(f'live/{live_id}', live_id, headers={'X-Platform': '3'})

        if live_info['status'] != 'ended':
            raise ExtractorError('WebRTC is not currently supported', expected=True)

        archive_url = traverse_obj(live_info, ('archive', 'share_url', {url_or_none}))
        if not archive_url:
            self.raise_no_formats(
                'This livestream has ended and no archive is available', expected=True)
            return None

        # The archive is an ordinary story page, so hand it to VoicyIE
        return self.url_result(archive_url, VoicyIE)
class VoicyPlaylistBaseIE(VoicyBaseIE):
    """Base class providing token-based pagination over Voicy list endpoints."""

    def _entries(self, path, some_id, query, keys, ie=VoicyIE):
        """Yield a url_result for every ``share_url`` found on each page of ``path``.

        ``keys`` is the traversal path leading to the items on a page.
        Paging follows ``pagination.next_page_token`` until it is absent,
        sleeping one second between requests.
        """
        extra_query = query or {}
        share_url_path = (*keys, 'share_url', {url_or_none})

        next_token = ''
        page_num = 0
        while True:
            page_num += 1
            page_data = self._call_api(
                path, some_id, f'Downloading page {page_num}',
                query={'page_size': '100', 'page_token': next_token} | extra_query)

            for share_url in traverse_obj(page_data, share_url_path):
                yield self.url_result(share_url, ie)

            next_token = traverse_obj(page_data, ('pagination', 'next_page_token', {str}))
            if not next_token:
                return
            self._sleep(1, some_id)
class VoicyChannelIE(VoicyPlaylistBaseIE):
    """Extractor for the story list of a Voicy channel (all, premium or a monthly back number)."""

    IE_NAME = 'voicy:channel'
    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/(?P<id>\d+)(?:/(?P<type>all|backnumber/\d+|premium))?(?:\?|$)'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/3402',
        'info_dict': {
            'id': '3402',
            'title': '安住紳一郎の日曜天国',
        },
        'playlist_mincount': 107,
    }, {
        'url': 'https://voicy.jp/channel/1/premium',
        'info_dict': {
            'id': '1',
            'title': 'Voicy社長の頭の中',
        },
        'playlist_mincount': 145,
    }, {
        'url': 'https://voicy.jp/channel/2856/all',
        'info_dict': {
            'id': '2856',
            'title': 'そんなこんなで、茅原実里です',
        },
        'playlist_mincount': 62,
    }, {
        'url': 'https://voicy.jp/channel/3321/all?type=all&month=202412',
        'info_dict': {
            'id': '3321',
            'title': '海外安全チャンネル・りょーあん',
        },
        'playlist_count': 4,
    }, {
        'url': 'https://voicy.jp/channel/1417/backnumber/202501',
        'info_dict': {
            'id': '1417',
            'title': '繪ほんの中には 公式チャンネル',
        },
        'playlist_count': 5,
    }]

    def _real_extract(self, url):
        """Build the ``stories`` API query from the URL and return the channel playlist.

        URL query parameters are forwarded (``type`` is renamed to
        ``filter_type``). A ``month`` parameter or a ``backnumber/YYYYMM``
        path is turned into a ``from``/``to`` range covering that month.
        """
        channel_id, page_type = self._match_valid_url(url).groups()
        channel_info = self._call_api(f'channel/{channel_id}', channel_id)

        # Forward the first value of each non-empty URL query parameter
        query = {}
        for param, values in parse_qs(url).items():
            if values:
                query['filter_type' if param == 'type' else param] = values[0]
        query = query | {'channel_view_id': channel_id, 'order': 'new'}

        if page_type == 'premium':
            query['filter_type'] = 'premium'
        else:
            # Prefer an explicit ?month=YYYYMM, else the trailing part of the path type
            year_month = query.pop('month', None)
            if not year_month and page_type not in ('all', None):
                year_month = page_type.split('/')[-1]
            if year_month:
                year, month = map(int, (year_month[:4], year_month[4:]))
                last_day = calendar.monthrange(year, month)[1]
                query['from'] = f'{year}-{month:02d}-01T00:00:00+09:00'
                query['to'] = f'{year}-{month:02d}-{last_day}T23:59:59+09:00'

        stories = self._entries('stories', channel_id, query, ('stories', ...))
        return self.playlist_result(stories, channel_id, channel_info['name'])
class VoicyTopicIE(VoicyPlaylistBaseIE):
    """Extractor for Voicy listing pages: fixed topics, categories, hashtags and searches."""

    IE_NAME = 'voicy:topic'
    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/(?P<id>{})(?:/(?P<ctg_or_kwd>[\w%]+))?(?:/(?P<sub_ctg>[\w-]+))?'.format(
        '|'.join(('audiobook', 'category', 'follow(?:ing-paystory)?', 'hashtag', 'paystory', 'pickup', 'search', 'voicedrama')))
    _TESTS = [{
        'url': 'https://voicy.jp/audiobook',
        'info_dict': {
            'id': '111',
            'title': 'audiobook',
        },
        'playlist_mincount': 96,
    }, {
        'url': 'https://voicy.jp/category/talk/voiceactor-announcer',
        'info_dict': {
            'id': 'voiceactor-announcer',
            'title': '声優・アナウンサー',
        },
        'playlist_mincount': 70,
    }, {
        'url': 'https://voicy.jp/category/sports/all',
        'info_dict': {
            'id': 'sports',
            'title': 'スポーツ',
        },
        'playlist_mincount': 174,
    }, {
        'url': 'https://voicy.jp/hashtag/%E3%82%B9%E3%83%9E%E3%83%BC%E3%83%88%E5%AE%B6%E9%9B%BB',
        'info_dict': {
            'id': 'hashtag',
            'title': 'スマート家電',
        },
        'playlist_mincount': 14,
    }, {
        'url': 'https://voicy.jp/search/%E6%81%B5%E6%96%B9%E5%B7%BB%E3%81%8D',
        'info_dict': {
            'id': 'search',
            'title': '恵方巻き',
        },
        'playlist_mincount': 102,
    }, {
        'url': 'https://voicy.jp/follow',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Map a listing URL to its API endpoint and return the resulting playlist.

        Raises ExtractorError when a category/hashtag/search URL lacks its
        required path segment, or when the requested category is not present
        in the category list returned by the API.
        """
        topic = self._match_id(url)
        # topic -> (topic id, whether items carry a nested 'story', extractor for the items)
        topic_id, is_story, ie = {
            'audiobook': ('111', False, VoicyChannelIE),
            'category': ('category', False, VoicyChannelIE),
            'follow': ('1', True, VoicyIE),
            'following-paystory': ('94', True, VoicyIE),
            'hashtag': ('hashtag', True, VoicyIE),
            'paystory': ('70', True, VoicyIE),
            'pickup': ('21', False, VoicyChannelIE),
            'search': ('search', True, VoicyIE),
            'voicedrama': ('93', False, VoicyChannelIE),
        }[topic]
        keys = ('channels', ..., *('story',) * is_story)

        if topic == 'category':
            ctg, sub_ctg = self._match_valid_url(url).group('ctg_or_kwd', 'sub_ctg')
            if not ctg:
                raise ExtractorError('Invalid URL', expected=True)
            # A missing sub-category is treated like 'all': list the whole category
            has_sub = sub_ctg not in (None, 'all')
            category = topic_id = sub_ctg if has_sub else ctg
            matched = traverse_obj(self._call_api('channel/categories', None), (
                *((..., 'subcategories') if has_sub else ()),
                lambda _, v: v['view_id'] == category, ('id', 'name'), {str_or_none},
            )) or []
            # Exactly one (id, name) pair is needed; fail clearly instead of
            # with an unpacking ValueError when the category is unknown
            if len(matched) != 2:
                raise ExtractorError(f'Unable to find category "{category}"', expected=True)
            category_id, topic = matched
            path = f'channel/categories/{category_id}'
            query = {'exclude_story': 'true'}
        elif topic in ('hashtag', 'search'):
            # Match against the percent-decoded URL so the keyword is captured decoded
            keyword = self._match_valid_url(urllib.parse.unquote(url)).group('ctg_or_kwd')
            if not keyword:
                raise ExtractorError('Invalid URL', expected=True)
            path = 'search/channels/story'
            query = {
                'search_type': {
                    'hashtag': 'hashtag',
                    'search': 'words',
                }[topic],
                'words': keyword,
            }
            topic = keyword
        elif topic_id == '1':
            path = 'user/me/channels/story/following'
            query = {'series_filter': '2'}
        else:
            path = f'topics/channels{"/story" * is_story}'
            query = {'topic_id': topic_id}
            # This endpoint needs one extra branching level before 'channels'
            keys = (..., *keys)

        return self.playlist_result(self._entries(path, topic_id, query, keys, ie), topic_id, topic)