diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index 738ce1d81c..f83f5b9f92 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -1,123 +1,160 @@
 import datetime as dt
-import json
 import re
-import urllib.parse
 
 from .common import InfoExtractor
-from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
     join_nonempty,
-    mimetype2ext,
-    parse_iso8601,
     qualities,
+    require,
     smuggle_url,
     str_or_none,
     strip_or_none,
+    traverse_obj,
     update_url_query,
-    url_or_none,
 )
-from ..utils.traversal import require, traverse_obj
 
 
 class TVerIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature)/)+(?P<id>[a-zA-Z0-9]+)'
     _GEO_COUNTRIES = ['JP']
     _GEO_BYPASS = False
-    _TESTS = [{
-        'skip': 'videos are only available for 7 days',
-        'url': 'https://tver.jp/episodes/ep83nf3w4p',
-        'info_dict': {
-            'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
-            'description': 'md5:dc2c06b6acc23f1e7c730c513737719b',
-            'series': '家事ヤロウ!!!',
-            'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
-            'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
-            'channel': 'テレビ朝日',
-            'id': 'ep83nf3w4p',
-            'ext': 'mp4',
+    _TESTS = [
+        {
+            'skip': 'videos are only available for 7 days',
+            'url': 'https://tver.jp/episodes/ep83nf3w4p',
+            'info_dict': {
+                'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
+                'description': 'md5:dc2c06b6acc23f1e7c730c513737719b',
+                'series': '家事ヤロウ!!!',
+                'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
+                'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!',
+                'channel': 'テレビ朝日',
+                'id': 'ep83nf3w4p',
+                'ext': 'mp4',
+            },
         },
-    }, {
-        'url': 'https://tver.jp/corner/f0103888',
-        'only_matching': True,
-    }, {
-        'url': 'https://tver.jp/lp/f0033031',
-        'only_matching': True,
-    }, {
-        'url': 'https://tver.jp/series/srtxft431v',
-        'info_dict': {
-            'id': 'srtxft431v',
-            'title': '名探偵コナン',
+        {
+            'url': 'https://tver.jp/corner/f0103888',
+            'only_matching': True,
         },
-        'playlist': [
-            {
-                'md5': '779ffd97493ed59b0a6277ea726b389e',
-                'info_dict': {
-                    'id': 'ref:conan-1137-241005',
-                    'ext': 'mp4',
-                    'title': '名探偵コナン #1137「行列店、味変の秘密」',
-                    'uploader_id': '5330942432001',
-                    'tags': [],
-                    'channel': '読売テレビ',
-                    'series': '名探偵コナン',
-                    'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5',
-                    'episode': '#1137「行列店、味変の秘密」',
-                    'duration': 1469.077,
-                    'timestamp': 1728030405,
-                    'upload_date': '20241004',
-                    'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分',
-                    'thumbnail': r're:https://.+\.jpg',
+        {
+            'url': 'https://tver.jp/lp/f0033031',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://tver.jp/series/srtxft431v',
+            'info_dict': {
+                'id': 'srtxft431v',
+                'title': '名探偵コナン',
+            },
+            'playlist': [
+                {
+                    'md5': '779ffd97493ed59b0a6277ea726b389e',
+                    'info_dict': {
+                        'id': 'ref:conan-1137-241005',
+                        'ext': 'mp4',
+                        'title': '名探偵コナン #1137「行列店、味変の秘密」',
+                        'uploader_id': '5330942432001',
+                        'tags': [],
+                        'channel': '読売テレビ',
+                        'series': '名探偵コナン',
+                        'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5',
+                        'episode': '#1137「行列店、味変の秘密」',
+                        'duration': 1469.077,
+                        'timestamp': 1728030405,
+                        'upload_date': '20241004',
+                        'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分',
+                        'thumbnail': r're:https://.+\.jpg',
+                    },
                 },
-            }],
-    }, {
-        'url': 'https://tver.jp/series/sru35hwdd2',
-        'info_dict': {
-            'id': 'sru35hwdd2',
-            'title': '神回だけ見せます!',
+            ],
+        },
+        {
+            'url': 'https://tver.jp/series/sru35hwdd2',
+            'info_dict': {
+                'id': 'sru35hwdd2',
+                'title': '神回だけ見せます!',
+            },
+            'playlist_count': 11,
         },
-        'playlist_count': 11,
-    }, {
-        'url': 'https://tver.jp/series/srkq2shp9d',
-        'only_matching': True,
-    }]
+        {
+            'url': 'https://tver.jp/series/srkq2shp9d',
+            'only_matching': True,
+        },
+    ]
 
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
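+    # Playback API endpoint of the STREAKS CMS that now serves TVer's streams,
+    # filled in with (project_id, 'ref:'-prefixed media reference ID)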
+    STREAKS_URL_TEMPLATE = 'https://playback.api.streaks.jp/v1/projects/%s/medias/%s'
+
     _HEADERS = {
         'x-tver-platform-type': 'web',
-        'Origin': 'https://tver.jp',
-        'Referer': 'https://tver.jp/',
+        'origin': 'https://tver.jp',
+        'referer': 'https://tver.jp/',
     }
     _PLATFORM_QUERY = {}
 
     def _real_initialize(self):
         session_info = self._download_json(
             'https://platform-api.tver.jp/v2/api/platform_users/browser/create',
-            None, 'Creating session', data=b'device_type=pc')
-        self._PLATFORM_QUERY = traverse_obj(session_info, ('result', {
-            'platform_uid': 'platform_uid',
-            'platform_token': 'platform_token',
-        }))
+            None,
+            'Creating session',
+            data=b'device_type=pc',
+        )
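+        # The platform_uid/platform_token pair issued here must accompany
+        # every subsequent platform API request (see _call_platform_api)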
+        self._PLATFORM_QUERY = traverse_obj(
+            session_info,
+            (
+                'result',
+                {
+                    'platform_uid': 'platform_uid',
+                    'platform_token': 'platform_token',
+                },
+            ),
+        )
 
     def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None):
         return self._download_json(
-            f'https://platform-api.tver.jp/service/api/{path}', video_id, note,
-            fatal=fatal, headers=self._HEADERS, query={
+            f'https://platform-api.tver.jp/service/api/{path}',
+            video_id,
+            note,
+            fatal=fatal,
+            headers=self._HEADERS,
+            query={
                 **self._PLATFORM_QUERY,
                 **(query or {}),
-            })
+            },
+        )
 
     def _yield_episode_ids_for_series(self, series_id):
         seasons_info = self._download_json(
             f'https://service-api.tver.jp/api/v1/callSeriesSeasons/{series_id}',
-            series_id, 'Downloading seasons info', headers=self._HEADERS)
+            series_id,
+            'Downloading seasons info',
+            headers=self._HEADERS,
+        )
         for season_id in traverse_obj(
-                seasons_info, ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str})):
+            seasons_info,
+            ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str}),
+        ):
             episodes_info = self._call_platform_api(
-                f'v1/callSeasonEpisodes/{season_id}', series_id, f'Downloading season {season_id} episodes info')
-            yield from traverse_obj(episodes_info, (
-                'result', 'contents', lambda _, v: v['type'] == 'episode', 'content', 'id', {str}))
+                f'v1/callSeasonEpisodes/{season_id}',
+                series_id,
+                f'Downloading season {season_id} episodes info',
+            )
+            yield from traverse_obj(
+                episodes_info,
+                (
+                    'result',
+                    'contents',
+                    lambda _, v: v['type'] == 'episode',
+                    'content',
+                    'id',
+                    {str},
+                ),
+            )
 
     def _real_extract(self, url):
         video_id, video_type = self._match_valid_url(url).group('id', 'type')
@@ -126,36 +163,49 @@ class TVerIE(InfoExtractor):
             raise ExtractorError(f'Invalid backend value: {backend}', expected=True)
 
         if video_type == 'series':
-            series_info = self._call_platform_api(
-                f'v2/callSeries/{video_id}', video_id, 'Downloading series info')
+            series_info = self._call_platform_api(f'v2/callSeries/{video_id}', video_id, 'Downloading series info')
             return self.playlist_from_matches(
-                self._yield_episode_ids_for_series(video_id), video_id,
+                self._yield_episode_ids_for_series(video_id),
+                video_id,
                 traverse_obj(series_info, ('result', 'content', 'content', 'title', {str})),
-                ie=TVerIE, getter=lambda x: f'https://tver.jp/episodes/{x}')
+                ie=TVerIE,
+                getter=lambda x: f'https://tver.jp/episodes/{x}',
+            )
 
         if video_type != 'episodes':
             webpage = self._download_webpage(url, video_id, note='Resolving to new URL')
-            video_id = self._match_id(self._search_regex(
-                (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'),
-                webpage, 'url regex'))
+            video_id = self._match_id(
+                self._search_regex(
+                    (
+                        r'canonical"\s*href="(https?://tver\.jp/[^"]+)"',
+                        r'&link=(https?://tver\.jp/[^?&]+)[?&]',
+                    ),
+                    webpage,
+                    'url regex',
+                ),
+            )
 
         episode_info = self._call_platform_api(
-            f'v1/callEpisode/{video_id}', video_id, 'Downloading episode info', fatal=False, query={
+            f'v1/callEpisode/{video_id}',
+            video_id,
+            'Downloading episode info',
+            fatal=False,
+            query={
                 'require_data': 'mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]',
-            })
-        episode_content = traverse_obj(
-            episode_info, ('result', 'episode', 'content')) or {}
+            },
+        )
+        episode_content = traverse_obj(episode_info, ('result', 'episode', 'content')) or {}
 
         version = traverse_obj(episode_content, ('version', {str_or_none}), default='5')
+
         video_info = self._download_json(
             f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info',
-            query={'v': version}, headers={'Referer': 'https://tver.jp/'})
+            query={'v': version}, headers={'Referer': 'https://tver.jp/'},
+        )
 
         episode = strip_or_none(episode_content.get('title'))
         series = str_or_none(episode_content.get('seriesTitle'))
-        title = (
-            join_nonempty(series, episode, delim=' ')
-            or str_or_none(video_info.get('title')))
+        title = join_nonempty(series, episode, delim=' ') or str_or_none(video_info.get('title'))
         provider = str_or_none(episode_content.get('productionProviderName'))
         onair_label = str_or_none(episode_content.get('broadcastDateLabel'))
@@ -164,7 +214,8 @@ class TVerIE(InfoExtractor):
                 'id': quality,
                 'url': update_url_query(
                     f'https://statics.tver.jp/images/content/thumbnail/episode/{quality}/{video_id}.jpg',
-                    {'v': version}),
+                    {'v': version},
+                ),
                 'width': width,
                 'height': height,
             }
@@ -214,14 +265,7 @@ class TVerIE(InfoExtractor):
         if not ref_id.startswith('ref:'):
             ref_id = f'ref:{ref_id}'
 
-        return {
-            **self._extract_from_streaks_api(video_info['streaks']['projectID'], ref_id, {
-                'Origin': 'https://tver.jp',
-                'Referer': 'https://tver.jp/',
-            }),
-            **metadata,
-            'id': video_id,
-        }
+        return self._streaks_backend(metadata, video_info, video_id)
 
     def _format_broadcast_date(self, onair_label):
         """
@@ -241,8 +285,7 @@ class TVerIE(InfoExtractor):
             return {}
 
         match = re.search(
-            pattern=r'(?:(?P<year>\d{4})年)|(?:(?P<month>\d{1,2})\D(?P<day>\d{1,2})\D)',
-            string=onair_label,
+            r'(?:(?P<year>\d{4})年)|(?:(?P<month>\d{1,2})\D(?P<day>\d{1,2})\D)', onair_label,
         )
 
         if not match:
@@ -250,138 +293,92 @@ class TVerIE(InfoExtractor):
             return {}
 
         data = {}
 
-        broadcast_date = match.groupdict()
-
-        if broadcast_date.get('year'):
-            data['release_year'] = int(broadcast_date['year'])
-
-        if broadcast_date.get('day') and broadcast_date.get('month'):
-            if 'release_year' in data:
-                year = data['release_year']
-            else:
-                year = dt.datetime.now().year
-            dt_ = dt.datetime.strptime(
-                f"{year}-{broadcast_date['month']}-{broadcast_date['day']}",
-                '%Y-%m-%d',
-            )
-            # if the date in the future, it means the broadcast date is in the previous year
-            # ref: https://github.com/yt-dlp/yt-dlp/pull/12282#issuecomment-2678132806
-            if dt_ > dt.datetime.now():
-                year -= 1
-
-            data['release_timestamp'] = dt.datetime(
-                year=year,
-                month=int(broadcast_date['month']),
-                day=int(broadcast_date['day']),
-            ).timestamp()
+        broadcast_date_info = match.groupdict()
+        data = {
+            'release_year': int_or_none(broadcast_date_info.get('year')),
+        }
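+        # broadcastDateLabel may carry only a year (e.g. 2024年) or only a
+        # month and day (e.g. 11月15日), so the two parts are parsed separately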
+        day, month = [int_or_none(broadcast_date_info.get(key)) for key in ('day', 'month')]
+        if day and month:
+            year = data.get('release_year') or dt.datetime.now().year
+            dt_ = dt.datetime.strptime(f'{year}-{month}-{day}', '%Y-%m-%d')
+            # If the date is in the future, it means the broadcast date is in the previous year
+            # Ref: https://github.com/yt-dlp/yt-dlp/pull/12282#issuecomment-2678132806
+            if dt_ > dt.datetime.now():
+                year -= 1
+            data['release_timestamp'] = dt.datetime(year=year, month=month, day=day).timestamp()
 
         return data
 
-    # XXX: Remove everything below and subclass TVerIE from StreaksBaseIE when #12679 is merged
-    _API_URL_TEMPLATE = 'https://{}.api.streaks.jp/v1/projects/{}/medias/{}{}'
-
-    def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=None, ssai=False):
-        try:
-            response = self._download_json(
-                self._API_URL_TEMPLATE.format('playback', project_id, media_id, ''),
-                media_id, 'Downloading streaks playback API JSON',
-                headers={
-                    'Accept': 'application/json',
-                    'Origin': 'https://players.streaks.jp',
-                    **self.geo_verification_headers(),
-                    **(headers or {}),
-                })
-        except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}:
-                error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False)
-                message = traverse_obj(error, ('message', {str}))
-                code = traverse_obj(error, ('code', {str}))
-                if code == 'REQUEST_FAILED':
-                    self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES)
-                elif code == 'MEDIA_NOT_FOUND':
-                    raise ExtractorError(message, expected=True)
-                elif code or message:
-                    raise ExtractorError(join_nonempty(code, message, delim=': '))
-            raise
-
-        streaks_id = response['id']
-        live_status = {
-            'clip': 'was_live',
-            'file': 'not_live',
-            'linear': 'is_live',
-            'live': 'is_live',
-        }.get(response.get('type'))
-
-        audio_quality_func = qualities(('1', '0'))
-        formats, subtitles = [], {}
-        drm_formats = False
-
-        for source in traverse_obj(response, ('sources', lambda _, v: v['src'])):
-            if source.get('key_systems'):
-                drm_formats = True
-                continue
+    def _streaks_backend(self, result, video_info, video_id):
+        self.write_debug('Using streaks.jp backend')
+
+        ref_id = traverse_obj(video_info, ('streaks', 'videoRefID'), get_all=False)
+        project_id = traverse_obj(video_info, ('streaks', 'projectID'), get_all=False)
 
-            src_url = source['src']
-            is_live = live_status == 'is_live'
-            ext = mimetype2ext(source.get('type'))
-
-            if ext == 'm3u8':
-                if is_live and ssai:
-                    session_params = traverse_obj(
-                        self._download_json(
-                            self._API_URL_TEMPLATE.format('ssai', project_id, streaks_id, '/ssai/session'),
-                            media_id, 'Downloading session parameters',
-                            headers={'Content-Type': 'application/json'},  # XXX: geo_verification_headers ?
-                            data=json.dumps({'id': source['id']}).encode()),
-                        (0, 'query', {urllib.parse.parse_qs}))
-                    src_url = update_url_query(src_url, session_params)
-
-                fmts, subs = self._extract_m3u8_formats_and_subtitles(
-                    src_url, media_id, 'mp4', m3u8_id='hls',
-                    fatal=False, live=is_live, query=query)
-
-                for fmt in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none'):
-                    if mobj := re.match(r'hls-[a-z]+_AUDIO-(?P<quality>\d)_\d+-', fmt['format_id']):
-                        fmt['quality'] = audio_quality_func(mobj.group('quality'))
-
-            elif ext == 'mpd':
-                fmts, subs = self._extract_mpd_formats_and_subtitles(
-                    src_url, media_id, mpd_id='dash', fatal=False)
-
-            else:
-                self.report_warning(f'Unsupported stream type: {ext}')
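+        # Without both the media reference ID and the project ID the STREAKS
+        # playback URL cannot be built, so fail early with an explicit error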
+        if not ref_id:
+            raise ExtractorError('Failed to extract reference ID for streaks.jp stream info')
+
+        if not project_id:
+            raise ExtractorError('Failed to extract project ID for streaks.jp stream info')
+
+        if not ref_id.startswith('ref:'):
+            ref_id = f'ref:{ref_id}'
+
+        url = self.STREAKS_URL_TEMPLATE % (project_id, ref_id)
+        self.write_debug(f'Streaks URL: {url}')
+
+        json_info = self._download_json(
+            url,
+            video_id,
+            'Downloading streaks.jp video info',
+            headers={
+                'origin': 'https://tver.jp',
+                'referer': 'https://tver.jp/',
+                **self.geo_verification_headers(),
+            },
+        )
+
+        formats = []
+        subtitles = {}
+        audio_quality_func = qualities(('0', '1', '2'))
+
+        for item in traverse_obj(json_info, 'sources', default=[]):
+            m3u8_url = traverse_obj(item, 'src')
+            if not m3u8_url:
                 continue
-            formats.extend(fmts)
-            self._merge_subtitles(subs, target=subtitles)
-
-        if not formats and drm_formats:
-            self.report_drm(media_id)
-
-        self._remove_duplicate_formats(formats)
-
-        for subs in traverse_obj(response, (
-            'tracks', lambda _, v: v['kind'] in ('subtitles', 'captions') and url_or_none(v['src']),
-        )):
-            lang = traverse_obj(subs, ('srclang', {str.lower})) or 'ja'
-            subtitles.setdefault(lang, []).append({'url': subs['src']})
-
-        return {
-            'id': streaks_id,
-            'display_id': media_id,
-            'channel_id': project_id,
-            'formats': formats,
-            'subtitles': subtitles,
-            'live_status': live_status,
-            **traverse_obj(response, {
-                'channel_id': ('project_id', {str}),
-                'uploader_id': ('profile', {str}),
-                'title': ('name', {str}),
-                'description': ('description', {str}, filter),
-                'duration': ('duration', {float_or_none}),
-                'tags': ('tags', ..., {str}),
-                'thumbnails': (('poster', 'thumbnail'), 'src', {'url': {url_or_none}}),
-                'timestamp': ('created_at', {parse_iso8601}),
-                'modified_timestamp': ('updated_at', {parse_iso8601}),
-            }),
-        }
+            item_formats, item_subtitles = self._extract_m3u8_formats_and_subtitles(
+                m3u8_url,
+                video_id,
+                'mp4',
+                m3u8_id='hls',
+                headers={'origin': 'https://tver.jp', 'referer': 'https://tver.jp/'},
+                note='Downloading streaks.jp m3u8 information',
+            )
+
+            for fmt in item_formats:
+                # Scoped inline flags such as (?i:audio) require Python 3.11+,
+                # so case-insensitivity is applied via re.IGNORECASE instead;
+                # a trailing _N suffix marks a sub-quality rendition, which the
+                # sign flip pushes below the corresponding main audio track
+                if mobj := re.match(r'hls-\w*?audio-(?P<qual>\d)(?:_(?P<sub_qual>\d))?', fmt['format_id'], re.IGNORECASE):
+                    fmt['quality'] = audio_quality_func(mobj.group('qual')) * ((-1) ** bool(mobj.group('sub_qual')))
+
+            if item_formats:
+                formats.extend(item_formats)
+
+            if item_subtitles:
+                subtitles.update(item_subtitles)
+
+        if not formats:
+            raise ExtractorError('Failed to extract any m3u8 streams from streaks.jp video info')
+
+        result.update(
+            {
+                'id': video_id,
+                'formats': formats,
+                'subtitles': subtitles,
+            },
+        )
+
+        # The playback API reports the duration in seconds (cf. the removed
+        # _extract_from_streaks_api above), so no millisecond scale is applied
+        duration = float_or_none(json_info.get('duration'))
+        if duration:
+            result['duration'] = duration
+
+        return result