[voicy] Add extractor (#667)

Authored by: nao20010128nao
4 years ago · e040bb0a41
parent f8fabc9930
commit e040bb0a41
2 changed files with 152 additions and 0 deletions
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -1605,6 +1605,10 @@ from .vodlocker import VodlockerIE
 from .vodpl import VODPlIE
 from .vodplatform import VODPlatformIE
 from .voicerepublic import VoiceRepublicIE
+from .voicy import (
+    VoicyIE,
+    VoicyChannelIE,
+)
 from .voot import (
    VootIE,
    VootSeriesIE,
--- a/yt_dlp/extractor/voicy.py
+++ b/yt_dlp/extractor/voicy.py
@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+    traverse_obj,
+    unsmuggle_url,
+    unified_strdate,
+)
+
+import re
+import itertools
+
+
+class VoicyBaseIE(InfoExtractor):
+    def _extract_from_playlist_data(self, value):
+        voice_id = compat_str(value.get('PlaylistId'))
+        upload_date = unified_strdate(value.get('Published'), False)
+        items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']]
+        return {
+            '_type': 'multi_video',
+            'entries': items,
+            'id': voice_id,
+            'title': compat_str(value.get('PlaylistName')),
+            'uploader': value.get('SpeakerName'),
+            'uploader_id': compat_str(value.get('SpeakerId')),
+            'channel': value.get('ChannelName'),
+            'channel_id': compat_str(value.get('ChannelId')),
+            'upload_date': upload_date,
+        }
+
+    def _extract_single_article(self, entry):
+        formats = [{
+            'url': entry['VoiceHlsFile'],
+            'format_id': 'hls',
+            'ext': 'm4a',
+            'acodec': 'aac',
+            'vcodec': 'none',
+            'protocol': 'm3u8_native',
+        }, {
+            'url': entry['VoiceFile'],
+            'format_id': 'mp3',
+            'ext': 'mp3',
+            'acodec': 'mp3',
+            'vcodec': 'none',
+        }]
+        self._sort_formats(formats)
+        return {
+            'id': compat_str(entry.get('ArticleId')),
+            'title': entry.get('ArticleTitle'),
+            'description': entry.get('MediaName'),
+            'formats': formats,
+        }
+
+    def _call_api(self, url, video_id, **kwargs):
+        response = self._download_json(url, video_id, **kwargs)
+        if response.get('Status') != 0:
+            message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=compat_str)
+            if not message:
+                message = 'There was a error in the response: %d' % response.get('Status')
+            raise ExtractorError(message, expected=False)
+        return response.get('Value')
+
+
+class VoicyIE(VoicyBaseIE):
+    IE_NAME = 'voicy'
+    _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)'
+    ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s'
+    _TESTS = [{
+        'url': 'https://voicy.jp/channel/1253/122754',
+        'info_dict': {
+            'id': '122754',
+            'title': '1/21(木)声日記：ついに原稿終わった！！',
+            'uploader': 'ちょまど@ ITエンジニアなオタク',
+            'uploader_id': '7339',
+        },
+        'playlist_mincount': 9,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        assert mobj
+        voice_id = mobj.group('id')
+        channel_id = mobj.group('channel_id')
+        url, article_list = unsmuggle_url(url)
+        if not article_list:
+            article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id)
+        return self._extract_from_playlist_data(article_list)
+
+
+class VoicyChannelIE(VoicyBaseIE):
+    IE_NAME = 'voicy:channel'
+    _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)'
+    PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s'
+    _TESTS = [{
+        'url': 'https://voicy.jp/channel/1253/',
+        'info_dict': {
+            'id': '7339',
+            'title': 'ゆるふわ日常ラジオ #ちょまラジ',
+            'uploader': 'ちょまど@ ITエンジニアなオタク',
+            'uploader_id': '7339',
+        },
+        'playlist_mincount': 54,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url)
+
+    def _entries(self, channel_id):
+        pager = ''
+        for count in itertools.count(1):
+            article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note='Paging #%d' % count)
+            playlist_data = article_list.get('PlaylistData')
+            if not playlist_data:
+                break
+            yield from playlist_data
+            last = playlist_data[-1]
+            pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount'])
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        articles = self._entries(channel_id)
+
+        first_article = next(articles, None)
+        title = traverse_obj(first_article, ('ChannelName', ), expected_type=compat_str)
+        speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=compat_str)
+        if not title and speaker_name:
+            title = 'Uploads from %s' % speaker_name
+        if not title:
+            title = 'Uploads from channel ID %s' % channel_id
+
+        articles = itertools.chain([first_article], articles) if first_article else articles
+
+        playlist = (
+            self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key())
+            for value in articles)
+        return {
+            '_type': 'playlist',
+            'entries': playlist,
+            'id': channel_id,
+            'title': title,
+            'channel': speaker_name,
+            'channel_id': channel_id,
+        }