[ie/Voicy] Rework extractor

pull/12470/head
doe1080 2 months ago
parent 0bb3978862
commit 728334a40e

@ -2389,6 +2389,8 @@ from .vodplatform import VODPlatformIE
from .voicy import (
VoicyChannelIE,
VoicyIE,
VoicyLiveIE,
VoicyTopicIE,
)
from .volejtv import VolejTVIE
from .voxmedia import (

@ -1,145 +1,375 @@
import calendar
import itertools
import time
import urllib.parse
from .common import InfoExtractor
from .wrestleuniverse import WrestleUniverseBaseIE
from ..utils import (
ExtractorError,
smuggle_url,
float_or_none,
merge_dicts,
parse_iso8601,
parse_qs,
str_or_none,
traverse_obj,
unified_strdate,
unsmuggle_url,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class VoicyBaseIE(InfoExtractor):
def _extract_from_playlist_data(self, value):
    """Build a ``multi_video`` result from one playlist dict.

    Every item of ``value['VoiceData']`` is converted with
    ``_extract_single_article``; a missing ``VoiceData`` key raises
    ``KeyError``. All other fields are optional.
    """
    return {
        '_type': 'multi_video',
        'entries': [self._extract_single_article(voice_data) for voice_data in value['VoiceData']],
        # str_or_none() keeps an absent field as None; a plain str() would
        # turn it into the literal string 'None'
        'id': str_or_none(value.get('PlaylistId')),
        'title': str_or_none(value.get('PlaylistName')),
        'uploader': value.get('SpeakerName'),
        'uploader_id': str_or_none(value.get('SpeakerId')),
        'channel': value.get('ChannelName'),
        'channel_id': str_or_none(value.get('ChannelId')),
        'upload_date': unified_strdate(value.get('Published'), False),
    }
class VoicyBaseIE(WrestleUniverseBaseIE):
_LOGIN_HEADERS = {
'Content-Type': 'application/json',
'X-Client-Version': 'Chrome/JsCore/10.13.2/FirebaseCore-web',
'X-Firebase-Gmpid': '1:212371279501:web:318567ddcbb953adcc5cc4',
}
_LOGIN_HINT = (
'Use --username refresh --password <refreshToken>, --username and --password, '
'--netrc-cmd, or --netrc (voicy) to provide account credentials')
_LOGIN_QUERY = {'key': 'AIzaSyC5Rg-sxiYu6ySD8V-f6Eljwll8gHvgUK4'}
_NETRC_MACHINE = 'voicy'
def _extract_single_article(self, entry):
formats = [{
'url': entry['VoiceHlsFile'],
'format_id': 'hls',
'ext': 'm4a',
'acodec': 'aac',
'vcodec': 'none',
'protocol': 'm3u8_native',
}, {
'url': entry['VoiceFile'],
'format_id': 'mp3',
'ext': 'mp3',
'acodec': 'mp3',
'vcodec': 'none',
}]
return {
'id': str(entry.get('ArticleId')),
'title': entry.get('ArticleTitle'),
'description': entry.get('MediaName'),
'formats': formats,
}
@WrestleUniverseBaseIE._TOKEN.getter
def _TOKEN(self):
    """Return the access token, refreshing it first when it is missing or expired.

    Requests login (via ``raise_login_required``) when a refresh is needed
    but no refresh token is available.
    """
    still_valid = self._REAL_TOKEN and self._TOKEN_EXPIRY > int(time.time())
    if still_valid:
        return self._REAL_TOKEN

    if not self._REFRESH_TOKEN:
        self.raise_login_required(
            f'No refreshToken provided. {self._LOGIN_HINT}', method=None)
    self._refresh_token()
    return self._REAL_TOKEN
def _perform_login(self, username, password):
if username.lower() == 'refresh':
self._REFRESH_TOKEN = password
return self._refresh_token()
return super()._perform_login(username, password)
def _call_api(self, url, video_id, **kwargs):
response = self._download_json(url, video_id, **kwargs)
if response.get('Status') != 0:
message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=str)
if not message:
message = 'There was a error in the response: %d' % response.get('Status')
raise ExtractorError(message, expected=False)
return response.get('Value')
def _call_api(self, path, some_id, note='Downloading JSON metadata', headers=None, query=None, fatal=True):
return self._download_json(
f'https://vmedia-player-api.voicy.jp/v1/{path}', some_id, note=note, headers={
'Authorization': f'Bearer {self._TOKEN}',
} | (headers or {}), query=query, fatal=fatal,
)
class VoicyIE(VoicyBaseIE):
_WORKING = False
IE_NAME = 'voicy'
_VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
IE_DESC = 'Voicy'
_VALID_URL = [
r'https?://(?:www\.)?voicy\.jp/channel/(?P<channel>\d+)/(?P<id>\d+)',
r'http://r\.voicy\.jp/\w+',
]
_TESTS = [{
'url': 'https://voicy.jp/channel/1253/122754',
'url': 'https://voicy.jp/channel/3402/6361249',
'info_dict': {
'id': '122754',
'title': '1/21(木)声日記:ついに原稿終わった!!',
'uploader': 'ちょまど@ ITエンジニアなオタク',
'uploader_id': '7339',
'id': '8576738',
'ext': 'm4a',
'title': '2025.1.19「ブラジル沖の白石康次郎さん息子の成人式を『1年』間違えたお母さん」',
'categories': ['トーク', '声優・アナウンサー'],
'channel': '安住紳一郎の日曜天国',
'channel_id': '3402',
'comment_count': int,
'description': 'md5:f39bb238ff7661c3b7e8934f8578cf33',
'display_id': '6361249',
'duration': 1588.741,
'like_count': int,
'release_date': '20250119',
'release_timestamp': 1737272164,
'series': '2025.1.19「ブラジル沖の白石康次郎さん息子の成人式を『1年』間違えたお母さん」',
'series_id': '6361249',
'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
'uploader': 'TBS RADIO',
'uploader_id': '17328',
'view_count': int,
},
'playlist_mincount': 9,
}, {
'url': 'https://voicy.jp/channel/3272/1141448',
'info_dict': {
'id': '2757390',
'ext': 'mp3',
'title': '5/3 お久しぶり雑談回',
'categories': ['トーク', '声優・アナウンサー'],
'channel': '松嵜麗のボイログ!',
'channel_id': '3272',
'comment_count': int,
'description': 'md5:4dee911d23cf1eedeb49687881878119',
'display_id': '1141448',
'duration': 433.24,
'like_count': int,
'release_date': '20240502',
'release_timestamp': 1714662728,
'series': '最近のわたし',
'series_id': '1141448',
'tags': ['最近のマイブーム'],
'thumbnail': r're:https://files\.voicy\.jp/img/speaker/.+$',
'uploader': '声優・松嵜麗',
'uploader_id': '16462',
'view_count': int,
},
}, {
'url': 'https://voicy.jp/channel/1417/6436213',
'info_dict': {
'id': '6436213',
'title': '第100回 グリム兄弟「麦のほ」',
},
'playlist_count': 5,
'skip': 'Only available for premium supporters',
}, {
'url': 'http://r.voicy.jp/7Qm2JbexmY6',
'only_matching': True,
}]
def _real_extract(self, url):
    """Return the playlist result for one voice URL.

    Article data smuggled into the URL is used as-is; otherwise it is
    fetched from ``ARTICLE_LIST_API_URL`` for the matched channel and voice IDs.
    """
    mobj = self._match_valid_url(url)
    assert mobj
    channel_id, voice_id = mobj.group('channel_id', 'id')
    _, smuggled_data = unsmuggle_url(url)
    playlist_data = smuggled_data or self._call_api(
        self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
    return self._extract_from_playlist_data(playlist_data)
class VoicyChannelIE(VoicyBaseIE):
_WORKING = False
if not url.startswith('https'):
return self.url_result(update_url(url, scheme='https'))
channel_id, audio_id = self._match_valid_url(url).groups()
channel_info = {
'channel_id': channel_id,
**traverse_obj(self._call_api(f'channel/{channel_id}', channel_id), {
'categories': ('category', ('name', ('subcategory', 'name'), {str})),
'channel': ('name', {str}),
'thumbnail': ('image', {url_or_none}),
'uploader': ('personality', 'name', {str}),
'uploader_id': ('personality', 'id', {str_or_none}),
}),
}
audio_info = self._call_api(f'channels/{channel_id}/stories/{audio_id}', audio_id)
common_info = {
'description': self._call_api(
f'channels/{channel_id}/stories/{audio_id}/description', audio_id, fatal=False,
).get('description'),
'display_id': audio_id,
'series': audio_info['name'],
'series_id': audio_id,
**traverse_obj(audio_info, {
'comment_count': ('comment_count', {int}),
'like_count': ('like_count', {int}),
'release_timestamp': ('published', {parse_iso8601}),
'tags': ('hashtags', ..., 'name', {str}),
}),
}
all_entries = traverse_obj(audio_info, ('chapters', ..., {
'id': ('id', {str_or_none}),
'title': ('name', {str}),
'duration': ('voice', 'duration', {float_or_none(scale=1000)}),
'manifest': ('voice', 'file', {url_or_none}),
'view_count': ('play_count', {int}),
}))
entries = []
for entry in all_entries:
if manifest := entry.pop('manifest', None):
ext = 'm4a' if 'audio_hls_aac' in manifest else 'mp3'
merged = merge_dicts(entry, channel_info, common_info, {
'formats': self._extract_m3u8_formats(manifest, audio_id, ext),
})
if len(all_entries) == 1:
return merged
entries.append(merged)
if not entries:
self.raise_login_required(
f'Premium(VIP) authentication required. {self._LOGIN_HINT}', method=None)
return self.playlist_result(entries, audio_id, audio_info['name'])
class VoicyLiveIE(VoicyBaseIE):
    """Extractor for live pages; ended streams are delegated to their archive URL."""

    IE_NAME = 'voicy:live'
    _VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/\d+/live/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://voicy.jp/channel/1417/live/4858078',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result for the archive of an ended livestream.

        Raises ``ExtractorError`` while the stream has not ended, and reports
        missing formats when an ended stream has no archive URL.
        """
        audio_id = self._match_id(url)
        live_info = self._call_api(f'live/{audio_id}', audio_id, headers={'X-Platform': '3'})

        if live_info['status'] != 'ended':
            raise ExtractorError('WebRTC is not currently supported', expected=True)

        share_url = traverse_obj(live_info, ('archive', 'share_url', {url_or_none}))
        if not share_url:
            self.raise_no_formats(
                'This livestream has ended and no archive is available', expected=True)
            return None
        return self.url_result(share_url, VoicyIE)
class VoicyPlaylistBaseIE(VoicyBaseIE):
    """Base class providing a paginated entry generator for playlist extractors."""

    def _entries(self, path, some_id, query, keys, ie=VoicyIE):
        """Yield a url_result for every ``share_url`` found under ``keys`` on each page.

        Pages are requested with ``page_size`` 100 (``query`` entries take
        precedence) until a response carries no ``pagination.next_page_token``.
        One second is slept between consecutive page requests.
        """
        page_token = ''
        for page_num in itertools.count(1):
            page_query = {'page_size': '100', 'page_token': page_token}
            page_query.update(query or {})
            info = self._call_api(
                path, some_id, f'Downloading page {page_num}', query=page_query)

            for share_url in traverse_obj(info, (*keys, 'share_url', {url_or_none})):
                yield self.url_result(share_url, ie)

            page_token = traverse_obj(info, ('pagination', 'next_page_token', {str}))
            if not page_token:
                break
            self._sleep(1, some_id)
class VoicyChannelIE(VoicyPlaylistBaseIE):
IE_NAME = 'voicy:channel'
_VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
_VALID_URL = r'https?://(?:www\.)?voicy\.jp/channel/(?P<id>\d+)(?:/(?P<type>all|backnumber/\d+|premium))?(?:\?|$)'
_TESTS = [{
'url': 'https://voicy.jp/channel/1253/',
'url': 'https://voicy.jp/channel/3402',
'info_dict': {
'id': '3402',
'title': '安住紳一郎の日曜天国',
},
'playlist_mincount': 107,
}, {
'url': 'https://voicy.jp/channel/1/premium',
'info_dict': {
'id': '1',
'title': 'Voicy社長の頭の中',
},
'playlist_mincount': 145,
}, {
'url': 'https://voicy.jp/channel/2856/all',
'info_dict': {
'id': '2856',
'title': 'そんなこんなで、茅原実里です',
},
'playlist_mincount': 62,
}, {
'url': 'https://voicy.jp/channel/3321/all?type=all&month=202412',
'info_dict': {
'id': '7339',
'title': 'ゆるふわ日常ラジオ #ちょまラジ',
'uploader': 'ちょまど@ ITエンジニアなオタク',
'uploader_id': '7339',
'id': '3321',
'title': '海外安全チャンネル・りょーあん',
},
'playlist_mincount': 54,
'playlist_count': 4,
}, {
'url': 'https://voicy.jp/channel/1417/backnumber/202501',
'info_dict': {
'id': '1417',
'title': '繪ほんの中には 公式チャンネル',
},
'playlist_count': 5,
}]
@classmethod
def suitable(cls, url):
    """Match only URLs that ``VoicyIE`` does not already claim."""
    if VoicyIE.suitable(url):
        return False
    return super().suitable(url)
def _real_extract(self, url):
channel_id, _type = self._match_valid_url(url).groups()
channel_info = self._call_api(f'channel/{channel_id}', channel_id)
query = {
'filter_type' if k == 'type' else k: v[0]
for k, v in parse_qs(url).items() if v
} | {'channel_view_id': channel_id, 'order': 'new'}
def _entries(self, channel_id):
pager = ''
for count in itertools.count(1):
article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note=f'Paging #{count}')
playlist_data = article_list.get('PlaylistData')
if not playlist_data:
break
yield from playlist_data
last = playlist_data[-1]
pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
if _type == 'premium':
query['filter_type'] = 'premium'
elif (ym := query.pop('month', None) or (_type not in ('all', None) and _type.split('/')[-1])):
y, m = map(int, (ym[:4], ym[4:]))
d = calendar.monthrange(y, m)[1]
query.update({
'from': f'{y}-{m:02d}-01T00:00:00+09:00',
'to': f'{y}-{m:02d}-{d}T23:59:59+09:00',
})
return self.playlist_result(self._entries(
'stories', channel_id, query, ('stories', ...),
), channel_id, channel_info['name'])
class VoicyTopicIE(VoicyPlaylistBaseIE):
IE_NAME = 'voicy:topic'
_VALID_URL = r'https?://(?:www\.)?voicy\.jp/(?P<id>{})(?:/(?P<ctg_or_kwd>[\w%]+))?(?:/(?P<sub_ctg>[\w-]+))?'.format(
'|'.join(('audiobook', 'category', 'follow(?:ing-paystory)?', 'hashtag', 'paystory', 'pickup', 'search', 'voicedrama')))
_TESTS = [{
'url': 'https://voicy.jp/audiobook',
'info_dict': {
'id': '111',
'title': 'audiobook',
},
'playlist_mincount': 96,
}, {
'url': 'https://voicy.jp/category/talk/voiceactor-announcer',
'info_dict': {
'id': 'voiceactor-announcer',
'title': '声優・アナウンサー',
},
'playlist_mincount': 70,
}, {
'url': 'https://voicy.jp/category/sports/all',
'info_dict': {
'id': 'sports',
'title': 'スポーツ',
},
'playlist_mincount': 174,
}, {
'url': 'https://voicy.jp/hashtag/%E3%82%B9%E3%83%9E%E3%83%BC%E3%83%88%E5%AE%B6%E9%9B%BB',
'info_dict': {
'id': 'hashtag',
'title': 'スマート家電',
},
'playlist_mincount': 14,
}, {
'url': 'https://voicy.jp/search/%E6%81%B5%E6%96%B9%E5%B7%BB%E3%81%8D',
'info_dict': {
'id': 'search',
'title': '恵方巻き',
},
'playlist_mincount': 102,
}, {
'url': 'https://voicy.jp/follow',
'only_matching': True,
}]
def _real_extract(self, url):
    """Return a lazy playlist of all voices of a channel.

    The first article is fetched eagerly to derive the playlist title
    (channel name, else speaker name, else the channel ID). Each entry is a
    url_result whose URL carries the article data as smuggled payload.
    """
    channel_id = self._match_id(url)
    remaining = self._entries(channel_id)
    first_article = next(remaining, None)

    speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=str)
    title = (
        traverse_obj(first_article, ('ChannelName', ), expected_type=str)
        or (f'Uploads from {speaker_name}' if speaker_name else None)
        or f'Uploads from channel ID {channel_id}')

    # Put the article consumed for the title back in front of the rest
    all_articles = remaining
    if first_article:
        all_articles = itertools.chain([first_article], remaining)

    def playlist_entries():
        for value in all_articles:
            voice_url = 'https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId'])
            yield self.url_result(smuggle_url(voice_url, value), VoicyIE.ie_key())

    return {
        '_type': 'playlist',
        'entries': playlist_entries(),
        'id': channel_id,
        'title': title,
        'channel': speaker_name,
        'channel_id': channel_id,
    }
# Map the matched topic keyword to an API path and query, then return the
# paginated results as a playlist.
topic = self._match_id(url)
# topic keyword -> (topic_id, is_story, ie):
#   topic_id - used as the playlist ID and, in the generic branch, as the 'topic_id' query value
#   is_story - whether results are read from the nested 'story' key / the '/story' endpoint
#   ie       - extractor class handed to _entries() for the result URLs
topic_id, is_story, ie = {
'audiobook': ('111', False, VoicyChannelIE),
'category': ('category', False, VoicyChannelIE),
'follow': ('1', True, VoicyIE),
'following-paystory': ('94', True, VoicyIE),
'hashtag': ('hashtag', True, VoicyIE),
'paystory': ('70', True, VoicyIE),
'pickup': ('21', False, VoicyChannelIE),
'search': ('search', True, VoicyIE),
'voicedrama': ('93', False, VoicyChannelIE),
}[topic]
# tuple * bool: 'story' is appended to the traversal path only when is_story is True
keys = ('channels', ..., *('story',) * is_story)
if topic == 'category':
ctg, sub_ctg = self._match_valid_url(url).group('ctg_or_kwd', 'sub_ctg')
# A sub-category of 'all' selects the top-level category itself; topic_id is
# replaced by the selected view ID
# NOTE(review): sub_ctg is None when the URL has no third path segment, which
# still makes has_sub True and category None -- confirm the lookup below cannot
# come back empty, since the two-name unpacking would then fail
category = topic_id = sub_ctg if (has_sub := sub_ctg != 'all') else ctg
# Look up the category's id and name by view_id; topic is rebound to the name
category_id, topic = traverse_obj(self._call_api('channel/categories', None), (
*((..., 'subcategories') if has_sub else ()),
lambda _, v: v['view_id'] == category, ('id', 'name'), {str_or_none},
))
path = f'channel/categories/{category_id}'
query = {'exclude_story': 'true'}
elif topic in ('hashtag', 'search'):
# The URL is percent-decoded before matching, so the keyword is captured decoded
keyword = self._match_valid_url(urllib.parse.unquote(url)).group('ctg_or_kwd')
if not keyword:
raise ExtractorError('Invalid URL', expected=True)
path = 'search/channels/story'
query = {
'search_type': {
'hashtag': 'hashtag',
'search': 'words',
}[topic],
'words': keyword,
}
# The keyword is what gets passed as the last playlist_result() argument below
topic = keyword
# '1' is the topic_id mapped to 'follow' in the table above
elif topic_id == '1':
path = 'user/me/channels/story/following'
query = {'series_filter': '2'}
else:
# str * bool: '/story' is appended to the path only when is_story is True
path = f'topics/channels{"/story" * is_story}'
query = {'topic_id': topic_id}
# Prepends a wildcard level to the traversal path
# NOTE(review): indentation is lost in this view, so it is unclear whether this
# line belongs to the else branch or runs for every topic -- confirm
keys = (..., *keys)
return self.playlist_result(self._entries(path, topic_id, query, keys, ie), topic_id, topic)

Loading…
Cancel
Save