From f05f3a9ef5e64ffcc6b5dacc4617df03a01247b3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 23 Mar 2025 08:18:59 +0000 Subject: [PATCH] revert formatting changes, refactor/rewrite --- yt_dlp/extractor/tver.py | 510 ++++++++++++++++++--------------------- 1 file changed, 238 insertions(+), 272 deletions(-) diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index 68121fb7ad..c37fa154aa 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -1,128 +1,113 @@ -import datetime +import datetime as dt +import json import re +import urllib.parse from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, + int_or_none, join_nonempty, + mimetype2ext, + parse_iso8601, qualities, smuggle_url, str_or_none, strip_or_none, - traverse_obj, update_url_query, + url_or_none, ) +from ..utils.traversal import require, traverse_obj class TVerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature)/)+(?P[a-zA-Z0-9]+)' - _TESTS = [ - { - 'skip': 'videos are only available for 7 days', - 'url': 'https://tver.jp/episodes/ep83nf3w4p', - 'info_dict': { - 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', - 'series': '家事ヤロウ!!!', - 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'channel': 'テレビ朝日', - 'id': 'ep83nf3w4p', - 'ext': 'mp4', - }, + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False + _TESTS = [{ + 'skip': 'videos are only available for 7 days', + 'url': 'https://tver.jp/episodes/ep83nf3w4p', + 'info_dict': { + 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', + 'series': '家事ヤロウ!!!', + 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'channel': 'テレビ朝日', + 'id': 'ep83nf3w4p', + 'ext': 'mp4', }, - { - 'url': 'https://tver.jp/corner/f0103888', - 'only_matching': True, + }, { + 'url': 'https://tver.jp/corner/f0103888', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/lp/f0033031', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/series/srtxft431v', + 'info_dict': { + 'id': 'srtxft431v', + 'title': '名探偵コナン', }, - { - 'url': 'https://tver.jp/lp/f0033031', - 'only_matching': True, - }, - { - 'url': 'https://tver.jp/series/srtxft431v', - 'info_dict': { - 'id': 'srtxft431v', - 'title': '名探偵コナン', - }, - 'playlist': [ - { - 'md5': '779ffd97493ed59b0a6277ea726b389e', - 'info_dict': { - 'id': 'ref:conan-1137-241005', - 'ext': 'mp4', - 'title': '名探偵コナン #1137「行列店、味変の秘密」', - 'uploader_id': '5330942432001', - 'tags': [], - 'channel': '読売テレビ', - 'series': '名探偵コナン', - 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', - 'episode': '#1137「行列店、味変の秘密」', - 'duration': 1469.077, - 'timestamp': 1728030405, - 'upload_date': '20241004', - 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', - 'thumbnail': r're:https://.+\.jpg', - }, + 'playlist': [ + { + 'md5': '779ffd97493ed59b0a6277ea726b389e', + 'info_dict': { + 'id': 'ref:conan-1137-241005', + 'ext': 'mp4', + 'title': '名探偵コナン #1137「行列店、味変の秘密」', + 'uploader_id': '5330942432001', + 'tags': [], + 'channel': '読売テレビ', + 'series': '名探偵コナン', + 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', + 'episode': '#1137「行列店、味変の秘密」', + 'duration': 1469.077, + 'timestamp': 1728030405, + 'upload_date': '20241004', + 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', + 'thumbnail': r're:https://.+\.jpg', }, - ], - }, - { - 'url': 'https://tver.jp/series/sru35hwdd2', - 'info_dict': { - 'id': 'sru35hwdd2', - 'title': '神回だけ見せます!', - }, - 'playlist_count': 11, + }], + }, { + 'url': 'https://tver.jp/series/sru35hwdd2', + 'info_dict': { + 'id': 'sru35hwdd2', + 'title': '神回だけ見せます!', }, - { - 'url': 'https://tver.jp/series/srkq2shp9d', - 'only_matching': True, - }, - ] + 'playlist_count': 11, + }, { + 'url': 'https://tver.jp/series/srkq2shp9d', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - STREAKS_URL_TEMPLATE = 'https://playback.api.streaks.jp/v1/projects/%s/medias/%s' - _HEADERS = { 'x-tver-platform-type': 'web', - 'origin': 'https://tver.jp/', - 'referer': 'https://tver.jp/', + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', } _PLATFORM_QUERY = {} def _real_initialize(self): session_info = self._download_json( 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', - None, - 'Creating session', - data=b'device_type=pc', - ) - self._PLATFORM_QUERY = traverse_obj( - session_info, - ( - 'result', - { - 'platform_uid': 'platform_uid', - 'platform_token': 'platform_token', - }, - ), - ) + None, 'Creating session', data=b'device_type=pc') + self._PLATFORM_QUERY = traverse_obj(session_info, ('result', { + 'platform_uid': 'platform_uid', + 'platform_token': 'platform_token', + })) def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None): return self._download_json( - f'https://platform-api.tver.jp/service/api/{path}', - video_id, - note, - fatal=fatal, - headers=self._HEADERS, - query={ + f'https://platform-api.tver.jp/service/api/{path}', video_id, note, + fatal=fatal, headers=self._HEADERS, query={ **self._PLATFORM_QUERY, **(query or {}), - }, - ) + }) def _yield_episode_ids_for_series(self, series_id): seasons_info = self._download_json( @@ -132,76 +117,49 @@ class TVerIE(InfoExtractor): headers=self._HEADERS, ) for season_id in traverse_obj( - seasons_info, - ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str}), - ): + seasons_info, ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str})): episodes_info = self._call_platform_api( - f'v1/callSeasonEpisodes/{season_id}', - series_id, - f'Downloading season {season_id} episodes info', - ) - yield from traverse_obj( - episodes_info, - ( - 'result', - 'contents', - lambda _, v: v['type'] == 'episode', - 'content', - 'id', - {str}, - ), - ) + f'v1/callSeasonEpisodes/{season_id}', series_id, f'Downloading season {season_id} episodes info') + yield from traverse_obj(episodes_info, ( + 'result', 'contents', lambda _, v: v['type'] == 'episode', 'content', 'id', {str})) def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') + backend = self._configuration_arg('backend', ['streaks'])[0] + if backend not in ('brightcove', 'streaks'): + raise ExtractorError(f'Invalid backend value: {backend}', expected=True) if video_type == 'series': - series_info = self._call_platform_api(f'v2/callSeries/{video_id}', video_id, 'Downloading series info') + series_info = self._call_platform_api( + f'v2/callSeries/{video_id}', video_id, 'Downloading series info') return self.playlist_from_matches( - self._yield_episode_ids_for_series(video_id), - video_id, + self._yield_episode_ids_for_series(video_id), video_id, traverse_obj(series_info, ('result', 'content', 'content', 'title', {str})), - ie=TVerIE, - getter=lambda x: f'https://tver.jp/episodes/{x}', - ) + ie=TVerIE, getter=lambda x: f'https://tver.jp/episodes/{x}') if video_type != 'episodes': webpage = self._download_webpage(url, video_id, note='Resolving to new URL') - video_id = self._match_id( - self._search_regex( - ( - r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', - r'&link=(https?://tver\.jp/[^?&]+)[?&]', - ), - webpage, - 'url regex', - ), - ) + video_id = self._match_id(self._search_regex( + (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), + webpage, 'url regex')) episode_info = self._call_platform_api( - f'v1/callEpisode/{video_id}', - video_id, - 'Downloading episode info', - fatal=False, - query={ + f'v1/callEpisode/{video_id}', video_id, 'Downloading episode info', fatal=False, query={ 'require_data': 'mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', - }, - ) - episode_content = traverse_obj(episode_info, ('result', 'episode', 'content')) or {} + }) + episode_content = traverse_obj( + episode_info, ('result', 'episode', 'content')) or {} version = traverse_obj(episode_content, ('version', {str_or_none}), default='5') - video_info = self._download_json( - f'https://statics.tver.jp/content/episode/{video_id}.json', - video_id, - 'Downloading video info', - query={'v': version}, - headers={'Referer': 'https://tver.jp/'}, - ) + f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', + query={'v': version}, headers={'Referer': 'https://tver.jp/'}) episode = strip_or_none(episode_content.get('title')) series = str_or_none(episode_content.get('seriesTitle')) - title = join_nonempty(series, episode, delim=' ') or str_or_none(video_info.get('title')) + title = ( + join_nonempty(series, episode, delim=' ') + or str_or_none(video_info.get('title'))) provider = str_or_none(episode_content.get('productionProviderName')) onair_label = str_or_none(episode_content.get('broadcastDateLabel')) @@ -210,8 +168,7 @@ class TVerIE(InfoExtractor): 'id': quality, 'url': update_url_query( f'https://statics.tver.jp/images/content/thumbnail/episode/{quality}/{video_id}.jpg', - {'v': version}, - ), + {'v': version}), 'width': width, 'height': height, } @@ -223,39 +180,52 @@ class TVerIE(InfoExtractor): ] ] - data = { + metadata = { 'title': title, 'series': series, 'episode': episode, # an another title which is considered "full title" for some viewers 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, - 'description': str_or_none(video_info.get('description')), 'thumbnails': thumbnails, + **traverse_obj(video_info, { + 'description': ('description', {str}), + 'timestamp': ('viewStatus', 'startAt', {int_or_none}), + 'episode_number': ('no', {int_or_none}), + }), } - ts = traverse_obj(video_info, ('viewStatus', 'startAt', {int}), default=None) - if ts: - data['timestamp'] = ts - - episode_number = traverse_obj(video_info, ('no', {str_or_none}), default=None) - if episode_number: - data['episode_number'] = int(episode_number) - if onair_label: - data.update(self._format_broadcast_date(onair_label)) - - backend = self._configuration_arg('backend', ['streaks'])[0] - - if backend not in ('brightcove', 'streaks'): - raise ExtractorError(f'Invalid backend value: {backend}', expected=True) + metadata.update(self._format_broadcast_date(onair_label)) if backend == 'brightcove': - data = self._brightcove_backend(data, video_info) - else: - data = self._streaks_backend(data, video_info, video_id) + p_id = video_info['video']['accountID'] + r_id = traverse_obj(video_info, ( + 'video', ('videoRefID', 'videoID'), {str}, any, {require('reference ID')})) + if not r_id.isdecimal(): + r_id = f'ref:{r_id}' + + return { + **metadata, + '_type': 'url_transparent', + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } - return data + ref_id = video_info['streaks']['videoRefID'] + if not ref_id.startswith('ref:'): + ref_id = f'ref:{ref_id}' + + return { + **self._extract_from_streaks_api(video_info['streaks']['projectID'], ref_id, { + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + }), + **metadata, + 'id': video_id, + } def _format_broadcast_date(self, onair_label): """ @@ -293,17 +263,17 @@ class TVerIE(InfoExtractor): if 'release_year' in data: year = data['release_year'] else: - year = datetime.datetime.now().year - dt = datetime.datetime.strptime( + year = dt.datetime.now().year + dt_ = dt.datetime.strptime( f"{year}-{broadcast_date['month']}-{broadcast_date['day']}", '%Y-%m-%d', ) # if the date in the future, it means the broadcast date is in the previous year # ref: https://github.com/yt-dlp/yt-dlp/pull/12282#issuecomment-2678132806 - if dt > datetime.datetime.now(): + if dt_ > dt.datetime.now(): year -= 1 - data['release_timestamp'] = datetime.datetime( + data['release_timestamp'] = dt.datetime( year=year, month=int(broadcast_date['month']), day=int(broadcast_date['day']), @@ -311,115 +281,111 @@ class TVerIE(InfoExtractor): return data - def _brightcove_backend(self, result, video_info): - self.write_debug('Using Brightcove backend') - - p_id = video_info['video']['accountID'] - r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) - - if not r_id: - raise ExtractorError('Failed to extract reference ID for Brightcove') - - if not r_id.isdigit(): - r_id = f'ref:{r_id}' - - result.update( - { - '_type': 'url_transparent', - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}, - ), - 'ie_key': 'BrightcoveNew', - }, - ) - - return result - - def _streaks_backend(self, result, video_info, video_id): - self.write_debug('Using streaks.jp backend') - - ref_id = traverse_obj(video_info, ('streaks', 'videoRefID'), get_all=False) - project_id = traverse_obj(video_info, ('streaks', 'projectID'), get_all=False) - - if not ref_id: - raise ExtractorError('Failed to extract reference ID for streaks.jp stream info') - - if not project_id: - raise ExtractorError('Failed to extract project ID for streaks.jp stream info') - - if not ref_id.startswith('ref:'): - ref_id = f'ref:{ref_id}' - - url = self.STREAKS_URL_TEMPLATE % (project_id, ref_id) - self.write_debug(f'Streaks URL: {url}') - - json_info = self._download_json( - url, - video_id, - 'Downloading streaks.jp streams video info', - headers={ - 'origin': 'https://tver.jp/', - 'referer': 'https://tver.jp/', - **self.geo_verification_headers(), - }, - ) - - sources = traverse_obj(json_info, ('sources'), default=[]) - - formats = [] - subtitles = {} - - for item in sources: - m3u8_url = traverse_obj(item, ('src'), default=None) - if not m3u8_url: + # XXX: Remove everything below and subclass TVerIE from StreaksBaseIE when #12679 is merged + _API_URL_TEMPLATE = 'https://{}.api.streaks.jp/v1/projects/{}/medias/{}{}' + + def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=None, ssai=False): + try: + response = self._download_json( + self._API_URL_TEMPLATE.format('playback', project_id, media_id, ''), + media_id, 'Downloading streaks playback API JSON', + headers={ + 'Accept': 'application/json', + 'Origin': 'https://players.streaks.jp', + **self.geo_verification_headers(), + **(headers or {}), + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}: + error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False) + message = traverse_obj(error, ('message', {str})) + code = traverse_obj(error, ('code', {str})) + if code == 'REQUEST_FAILED': + self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES) + elif code == 'MEDIA_NOT_FOUND': + raise ExtractorError(message, expected=True) + elif code or message: + raise ExtractorError(join_nonempty(code, message, delim=': ')) + raise + + streaks_id = response['id'] + live_status = { + 'clip': 'was_live', + 'file': 'not_live', + 'linear': 'is_live', + 'live': 'is_live', + }.get(response.get('type')) + + audio_quality_func = qualities(('1', '0')) + formats, subtitles = [], {} + drm_formats = False + + for source in traverse_obj(response, ('sources', lambda _, v: v['src'])): + if source.get('key_systems'): + drm_formats = True continue - item_formats, item_subtitles = self._extract_m3u8_formats_and_subtitles( - m3u8_url, - video_id, - 'mp4', - m3u8_id='hls', - headers={'origin': 'https://tver.jp/', 'referer': 'https://tver.jp/'}, - note='Downloading streaks.jp m3u8 information', - ) - - build_qualities = [] - - for f in item_formats: - # hls-ts_AUDIO-1_1-pro_105ba7d8d0f4452bb452acf09466b2a5 - if mobj := re.search(r'hls-ts_AUDIO-(.+?)-', f['format_id']): - build_qualities.append(mobj.group(1)) - - if len(build_qualities) > 0: - # it seems best quality is 0_X. see ref for more info - # ref: https://github.com/yt-dlp/yt-dlp/issues/12643#issuecomment-2745263178 - build_qualities.sort(key=lambda x: int(x.split('_')[0]), reverse=True) - quality = qualities(tuple(build_qualities)) - - for f in item_formats: - if mobj := re.search(r'hls-ts_AUDIO-(.+?)-', f['format_id']): - f['quality'] = quality(mobj.group(1)) - - if len(item_formats) > 0: - formats.extend(item_formats) + src_url = source['src'] + is_live = live_status == 'is_live' + ext = mimetype2ext(source.get('type')) + + if ext == 'm3u8': + if is_live and ssai: + session_params = traverse_obj( + self._download_json( + self._API_URL_TEMPLATE.format('ssai', project_id, streaks_id, '/ssai/session'), + media_id, 'Downloading session parameters', + headers={'Content-Type': 'application/json'}, # XXX: geo_verification_headers ? + data=json.dumps({'id': source['id']}).encode()), + (0, 'query', {urllib.parse.parse_qs})) + src_url = update_url_query(src_url, session_params) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, media_id, 'mp4', m3u8_id='hls', + fatal=False, live=is_live, query=query) + + for fmt in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none'): + if mobj := re.match(r'hls-[a-z]+_AUDIO-(?P\d)_\d+-', fmt['format_id']): + fmt['quality'] = audio_quality_func(mobj.group('quality')) + + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, media_id, mpd_id='dash', fatal=False) - if len(item_subtitles) > 0: - subtitles.update(item_subtitles) - - if len(formats) < 1: - raise ExtractorError('Failed to extract any m3u8 streams from streaks.jp video info') - - result.update( - { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - }, - ) - - duration = float_or_none(json_info.get('duration'), 1000) - if duration: - result['duration'] = duration + else: + self.report_warning(f'Unsupported stream type: {ext}') + continue - return result + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if not formats and drm_formats: + self.report_drm(media_id) + + self._remove_duplicate_formats(formats) + + for subs in traverse_obj(response, ( + 'tracks', lambda _, v: v['kind'] in ('subtitles', 'captions') and url_or_none(v['src']), + )): + lang = traverse_obj(subs, ('srclang', {str.lower})) or 'ja' + subtitles.setdefault(lang, []).append({'url': subs['src']}) + + return { + 'id': streaks_id, + 'display_id': media_id, + 'channel_id': project_id, + 'formats': formats, + 'subtitles': subtitles, + 'live_status': live_status, + **traverse_obj(response, { + 'channel_id': ('project_id', {str}), + 'uploader_id': ('profile', {str}), + 'title': ('name', {str}), + 'description': ('description', {str}, filter), + 'duration': ('duration', {float_or_none}), + 'tags': ('tags', ..., {str}), + 'thumbnails': (('poster', 'thumbnail'), 'src', {'url': {url_or_none}}), + 'timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + }), + }