[extractor/niconico:live] Add extractor (#5764)

Authored by: Lesmiscore
2 years ago · f8f9250fe2
parent 3459d3c5af
commit f8f9250fe2
4 changed files with 266 additions and 2 deletions
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@@ -30,7 +30,7 @@ from .hls import HlsFD
 from .http import HttpFD
 from .ism import IsmFD
 from .mhtml import MhtmlFD
-from .niconico import NiconicoDmcFD
+from .niconico import NiconicoDmcFD, NiconicoLiveFD
 from .rtmp import RtmpFD
 from .rtsp import RtspFD
 from .websocket import WebSocketFragmentFD
@@ -50,6 +50,7 @@ PROTOCOL_MAP = {
    'ism': IsmFD,
    'mhtml': MhtmlFD,
    'niconico_dmc': NiconicoDmcFD,
+    'niconico_live': NiconicoLiveFD,
    'fc2_live': FC2LiveFD,
    'websocket_frag': WebSocketFragmentFD,
    'youtube_live_chat': YoutubeLiveChatFD,
--- a/yt_dlp/downloader/niconico.py
+++ b/yt_dlp/downloader/niconico.py
@@ -1,8 +1,17 @@
+import json
 import threading
+import time

 from . import get_suitable_downloader
 from .common import FileDownloader
-from ..utils import sanitized_Request
+from .external import FFmpegFD
+from ..utils import (
+    DownloadError,
+    str_or_none,
+    sanitized_Request,
+    WebSocketsWrapper,
+    try_get,
+)


 class NiconicoDmcFD(FileDownloader):
@@ -50,3 +59,93 @@ class NiconicoDmcFD(FileDownloader):
                    timer[0].cancel()
                    download_complete = True
        return success
+
+
+class NiconicoLiveFD(FileDownloader):
+    """Download a niconico live stream while keeping the watch session alive.
+
+    The media itself is downloaded by FFmpegFD. In parallel, a daemon thread
+    listens on the niconico WebSocket connection, answers server pings with
+    "pong" and "keepSeat" messages, and reconnects (sending "startWatching"
+    again) whenever the connection fails.
+    """
+
+    def real_download(self, filename, info_dict):
+        """Start the WebSocket keep-alive thread, then download with FFmpegFD.
+
+        Keys read from info_dict:
+            'video_id'      -- used only in the reconnect log message
+            'url'           -- see the review note below
+            'ws'            -- an already-open WebSocket connection object,
+                               used for the first (non-reconnect) session
+            'origin'        -- host name placed in the Origin header
+            'cookies'       -- optional cookie string for the handshake
+            'live_quality'  -- optional, defaults to 'high'
+            'live_latency'  -- optional, defaults to 'high'
+
+        Returns whatever FFmpegFD.download() returns.
+        """
+        video_id = info_dict['video_id']
+        # NOTE(review): this same 'url' value is opened as a WebSocket on
+        # reconnect and is also what FFmpegFD receives through new_info_dict.
+        # Confirm the caller supplies a URL that is valid for both purposes.
+        ws_url = info_dict['url']
+        ws_extractor = info_dict['ws']
+        ws_origin_host = info_dict['origin']
+        cookies = info_dict.get('cookies')
+        live_quality = info_dict.get('live_quality', 'high')
+        live_latency = info_dict.get('live_latency', 'high')
+        dl = FFmpegFD(self.ydl, self.params or {})
+
+        # FFmpegFD gets a copy of the info dict with the protocol switched to m3u8
+        new_info_dict = info_dict.copy()
+        new_info_dict.update({
+            'protocol': 'm3u8',
+        })
+
+        def communicate_ws(reconnect):
+            """Serve one WebSocket session until the server ends it.
+
+            With reconnect=False the connection passed in info_dict['ws'] is
+            used as-is; with reconnect=True a new connection is opened and a
+            "startWatching" request is sent first.
+
+            Returns True when the server sends a 'disconnect' message.
+            Raises DownloadError when the server sends an 'error' message.
+            """
+            if reconnect:
+                headers = {
+                    'Origin': f'https://{ws_origin_host}',
+                    'Accept': '*/*',
+                    'User-Agent': self.params['http_headers']['User-Agent'],
+                }
+                cookie_header = str_or_none(cookies)
+                if cookie_header:
+                    # The HTTP request header is "Cookie" (singular); it is
+                    # left out entirely when there is nothing to send
+                    headers['Cookie'] = cookie_header
+                ws = WebSocketsWrapper(ws_url, headers)
+                if self.ydl.params.get('verbose', False):
+                    self.to_screen('[debug] Sending startWatching request')
+                ws.send(json.dumps({
+                    'type': 'startWatching',
+                    'data': {
+                        'stream': {
+                            'quality': live_quality,
+                            'protocol': 'hls+fmp4',
+                            'latency': live_latency,
+                            'chasePlay': False
+                        },
+                        'room': {
+                            'protocol': 'webSocket',
+                            'commentable': True
+                        },
+                        'reconnect': True,
+                    }
+                }))
+            else:
+                ws = ws_extractor
+            with ws:
+                while True:
+                    recv = ws.recv()
+                    if not recv:
+                        continue
+                    data = json.loads(recv)
+                    if not data or not isinstance(data, dict):
+                        continue
+                    msg_type = data.get('type')
+                    if msg_type == 'ping':
+                        # pong back
+                        ws.send(r'{"type":"pong"}')
+                        ws.send(r'{"type":"keepSeat"}')
+                    elif msg_type == 'disconnect':
+                        self.write_debug(data)
+                        return True
+                    elif msg_type == 'error':
+                        self.write_debug(data)
+                        message = try_get(data, lambda x: x['body']['code'], str) or recv
+                        # Raise (not return) so that ws_main logs the error and
+                        # waits before reconnecting instead of retrying at once
+                        raise DownloadError(message)
+                    elif self.ydl.params.get('verbose', False):
+                        if len(recv) > 100:
+                            recv = recv[:100] + '...'
+                        self.to_screen('[debug] Server said: %s' % recv)
+
+        def ws_main():
+            """Run WebSocket sessions until the server disconnects us.
+
+            The first session reuses the connection from info_dict; every
+            later one reconnects. Any failure is logged and retried after
+            10 seconds.
+            """
+            reconnect = False
+            while True:
+                try:
+                    ret = communicate_ws(reconnect)
+                    if ret is True:
+                        return
+                # NOTE(review): BaseException also covers KeyboardInterrupt and
+                # SystemExit; kept as in the original so that every failure
+                # leads to a reconnect -- confirm this breadth is intended
+                except BaseException as e:
+                    self.to_screen('[%s] %s: Connection error occured, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e)))
+                    time.sleep(10)
+                    continue
+                finally:
+                    # Every session after the first one must open a new connection
+                    reconnect = True
+
+        # Daemon thread: it does not keep the process alive after the download ends
+        thread = threading.Thread(target=ws_main, daemon=True)
+        thread.start()
+
+        return dl.download(filename, new_info_dict)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1275,6 +1275,7 @@ from .niconico import (
    NicovideoSearchIE,
    NicovideoSearchURLIE,
    NicovideoTagURLIE,
+    NiconicoLiveIE,
 )
 from .ninecninemedia import (
    NineCNineMediaIE,
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -5,13 +5,17 @@ import json
 import re
 import time

+from urllib.parse import urlparse
+
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
    compat_HTTPError,
 )
+from ..dependencies import websockets
 from ..utils import (
    ExtractorError,
    OnDemandPagedList,
+    WebSocketsWrapper,
    bug_reports_message,
    clean_html,
    float_or_none,
@@ -895,3 +899,162 @@ class NiconicoUserIE(InfoExtractor):
    def _real_extract(self, url):
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
+
+
+class NiconicoLiveIE(InfoExtractor):
+    """Extractor for niconico live broadcasts (live.nicovideo.jp).
+
+    The watch page embeds a WebSocket URL. A "startWatching" request is sent
+    over that socket and the server answers with a 'stream' message that
+    carries the HLS playlist URL. The open socket is attached to each format
+    so that the 'niconico_live' downloader can keep using it.
+    """
+    IE_NAME = 'niconico:live'
+    IE_DESC = 'ニコニコ生放送'
+    _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)'
+    _TESTS = [{
+        'note': 'this test case includes invisible characters for title, pasting them as-is',
+        'url': 'https://live.nicovideo.jp/watch/lv339533123',
+        'info_dict': {
+            'id': 'lv339533123',
+            'title': '激辛ペヤング食べます‪( ;ᯅ; )‬（歌枠オーディション参加中）',
+            'view_count': 1526,
+            'comment_count': 1772,
+            'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます',
+            'uploader': 'もか',
+            'channel': 'ゲストさんのコミュニティ',
+            'channel_id': 'co5776900',
+            'channel_url': 'https://com.nicovideo.jp/community/co5776900',
+            'timestamp': 1670677328,
+            'is_live': True,
+        },
+        'skip': 'livestream',
+    }, {
+        'url': 'https://live2.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }, {
+        'url': 'https://sp.live.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }, {
+        'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }]
+
+    # Values accepted for the 'latency' extractor argument; anything else falls back to 'high'
+    _KNOWN_LATENCY = ('high', 'low')
+
+    def _real_extract(self, url):
+        """Return the info dict for a live broadcast.
+
+        Raises ExtractorError when the websockets library is missing, when
+        the page has no WebSocket URL (broadcast not started or already
+        ended), or when the server answers the watch request with a
+        'disconnect' or 'error' message.
+        """
+        if not websockets:
+            raise ExtractorError('websockets library is not available. Please install it.', expected=True)
+        video_id = self._match_id(url)
+        # Always fetch the canonical watch URL, whichever host variant was given
+        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
+
+        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
+            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
+
+        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
+        if not ws_url:
+            raise ExtractorError('The live hasn\'t started yet or already ended.', expected=True)
+        ws_url = update_url_query(ws_url, {
+            'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
+        })
+
+        hostname = remove_start(urlparse(urlh.geturl()).hostname, 'sp.')
+        cookies = try_get(urlh.geturl(), self._downloader._calc_cookies)
+        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
+        if latency not in self._KNOWN_LATENCY:
+            latency = 'high'
+
+        ws_headers = {
+            'Origin': f'https://{hostname}',
+            'Accept': '*/*',
+            'User-Agent': self.get_param('http_headers')['User-Agent'],
+        }
+        cookie_header = str_or_none(cookies)
+        if cookie_header:
+            # The HTTP request header is "Cookie" (singular); it is left out
+            # entirely when there is nothing to send
+            ws_headers['Cookie'] = cookie_header
+        ws = WebSocketsWrapper(ws_url, ws_headers)
+
+        # No literal "[debug] " prefix here, matching the other write_debug call below
+        self.write_debug('Sending HLS server request')
+        ws.send(json.dumps({
+            'type': 'startWatching',
+            'data': {
+                'stream': {
+                    'quality': 'abr',
+                    'protocol': 'hls+fmp4',
+                    'latency': latency,
+                    'chasePlay': False
+                },
+                'room': {
+                    'protocol': 'webSocket',
+                    'commentable': True
+                },
+                'reconnect': False,
+            }
+        }))
+
+        # Read messages until the server announces the stream (or refuses)
+        while True:
+            recv = ws.recv()
+            if not recv:
+                continue
+            data = json.loads(recv)
+            if not isinstance(data, dict):
+                continue
+            if data.get('type') == 'stream':
+                m3u8_url = data['data']['uri']
+                qualities = data['data']['availableQualities']
+                break
+            elif data.get('type') == 'disconnect':
+                self.write_debug(recv)
+                raise ExtractorError('Disconnected at middle of extraction')
+            elif data.get('type') == 'error':
+                self.write_debug(recv)
+                message = traverse_obj(data, ('body', 'code')) or recv
+                raise ExtractorError(message)
+            elif self.get_param('verbose', False):
+                if len(recv) > 100:
+                    recv = recv[:100] + '...'
+                self.write_debug('Server said: %s' % recv)
+
+        title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta(
+            ('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
+
+        # Each 'thumbnail' entry is either a single value (used directly as
+        # the URL) or a dict of several URLs keyed by some label
+        raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {}
+        thumbnails = []
+        for name, value in raw_thumbs.items():
+            if not isinstance(value, dict):
+                thumbnails.append({
+                    'id': name,
+                    'url': value,
+                    **parse_resolution(value, lenient=True),
+                })
+                continue
+
+            for k, img_url in value.items():
+                # Take the resolution from the key, falling back to the URL
+                res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True)
+                width, height = res.get('width'), res.get('height')
+
+                thumbnails.append({
+                    'id': f'{name}_{width}x{height}',
+                    'url': img_url,
+                    **res,
+                })
+
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
+        # NOTE(review): presumably qualities[0] is the adaptive ('abr') entry
+        # and the remaining ones are listed in the reverse order of the m3u8
+        # variants -- confirm. zip() stops at the shorter sequence, so any
+        # surplus formats keep the fields set by _extract_m3u8_formats.
+        for fmt, q in zip(formats, reversed(qualities[1:])):
+            fmt.update({
+                'format_id': q,
+                'protocol': 'niconico_live',
+                # Extra fields passed along in the format dict
+                'ws': ws,
+                'video_id': video_id,
+                'cookies': cookies,
+                'live_latency': latency,
+                'origin': hostname,
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            **traverse_obj(embedded_data, {
+                'view_count': ('program', 'statistics', 'watchCount'),
+                'comment_count': ('program', 'statistics', 'commentCount'),
+                'uploader': ('program', 'supplier', 'name'),
+                'channel': ('socialGroup', 'name'),
+                'channel_id': ('socialGroup', 'id'),
+                'channel_url': ('socialGroup', 'socialGroupPageUrl'),
+            }),
+            'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))),
+            'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))),
+            'is_live': True,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }